From 4f28de5ad0faddd8c54ed28ef479d1741f59df4a Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Sun, 27 Oct 2024 19:43:30 -0700 Subject: [PATCH 001/148] [DeviceASAN] Use device usm to sync asan runtime data instead of shared usm Shared USM has poor performance, change it to device USM will benefit several benchmarks. --- .../layers/sanitizer/asan_interceptor.cpp | 133 +++++++++--------- .../layers/sanitizer/asan_interceptor.hpp | 66 +++++++-- .../layers/sanitizer/asan_libdevice.hpp | 3 +- source/loader/layers/sanitizer/ur_sanddi.cpp | 7 +- 4 files changed, 127 insertions(+), 82 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 4a315588fd..e023208b59 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -243,7 +243,7 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo) { + LaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); @@ -268,12 +268,14 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo) { + LaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback auto Result = getContext()->urDdiTable.Queue.pfnFinish(Queue); + UR_CALL(LaunchInfo.Data.syncFromDevice(Queue)); + if (Result == UR_RESULT_SUCCESS) { - for (const auto &AH : LaunchInfo.Data->SanitizerReport) { + for (const auto &AH : LaunchInfo.Data.Host.SanitizerReport) { if (!AH.Flag) { continue; } @@ -600,7 +602,7 @@ SanitizerInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t SanitizerInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { do { auto KernelInfo = getKernelInfo(Kernel); @@ -635,27 +637,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( } } - // Set launch info argument - auto ArgNums = GetKernelNumArgs(Kernel); - if (ArgNums) { - getContext()->logger.debug( - "launch_info {} (numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, - (void *)LaunchInfo.Data->LocalArgs); - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("Failed to set launch info: {}", - URes); - return URes; - } - } - - LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; - LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; - LaunchInfo.Data->DeviceTy = DeviceInfo->Type; - LaunchInfo.Data->Debug = getOptions().Debug ? 
1 : 0; - if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); auto URes = @@ -682,6 +663,32 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalWorkSize[Dim]; } + // Set launch info argument + auto ArgNums = GetKernelNumArgs(Kernel); + if (ArgNums == 0) { + return UR_RESULT_SUCCESS; + } + + LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; + LaunchInfo.Data.Host.GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; + LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; + LaunchInfo.Data.Host.Debug = getOptions().Debug ? 1 : 0; + + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + ContextInfo->Handle, DeviceInfo->Handle, nullptr, nullptr, + sizeof(LaunchInfo), (void **)&LaunchInfo.Data.DevicePtr)); + getContext()->logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data.DevicePtr, + LaunchInfo.Data.Host.NumLocalArgs, + (void *)LaunchInfo.Data.Host.LocalArgs); + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.DevicePtr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to set launch info: {}", URes); + return URes; + } + auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, Device = DeviceInfo->Handle, Queue](size_t Size, uptr &Ptr) { @@ -730,7 +737,7 @@ ur_result_t SanitizerInterceptor::prepareLaunch( if (EnqueueAllocateShadowMemory( LocalShadowMemorySize, - LaunchInfo.Data->LocalShadowOffset) != + LaunchInfo.Data.Host.LocalShadowOffset) != UR_RESULT_SUCCESS) { getContext()->logger.warning( "Failed to allocate shadow memory for local " @@ -741,8 +748,8 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "Skip checking local memory of kernel <{}>", GetKernelName(Kernel)); } else { - LaunchInfo.Data->LocalShadowOffsetEnd = - LaunchInfo.Data->LocalShadowOffset + + LaunchInfo.Data.Host.LocalShadowOffsetEnd = + LaunchInfo.Data.Host.LocalShadowOffset + LocalShadowMemorySize - 1; ContextInfo->Stats.UpdateShadowMalloced( @@ -750,8 +757,8 @@ ur_result_t SanitizerInterceptor::prepareLaunch( getContext()->logger.info( "ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.Data->LocalShadowOffset, - (void *)LaunchInfo.Data->LocalShadowOffsetEnd); + (void *)LaunchInfo.Data.Host.LocalShadowOffset, + (void *)LaunchInfo.Data.Host.LocalShadowOffsetEnd); } } } @@ -759,7 +766,7 @@ ur_result_t SanitizerInterceptor::prepareLaunch( // Write shadow memory offset for private memory if (getOptions().DetectPrivates) { if (DeviceInfo->Type == DeviceType::CPU) { - LaunchInfo.Data->PrivateShadowOffset = + LaunchInfo.Data.Host.PrivateShadowOffset = DeviceInfo->Shadow->ShadowBegin; } else if (DeviceInfo->Type == DeviceType::GPU_PVC || DeviceInfo->Type == DeviceType::GPU_DG2) { @@ -772,7 +779,7 @@ ur_result_t SanitizerInterceptor::prepareLaunch( if (EnqueueAllocateShadowMemory( PrivateShadowMemorySize, - LaunchInfo.Data->PrivateShadowOffset) != + LaunchInfo.Data.Host.PrivateShadowOffset) != UR_RESULT_SUCCESS) { getContext()->logger.warning( "Failed to allocate shadow memory for private " @@ -783,8 +790,8 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "Skip checking private memory of kernel <{}>", GetKernelName(Kernel)); } else { - LaunchInfo.Data->PrivateShadowOffsetEnd = - LaunchInfo.Data->PrivateShadowOffset + + LaunchInfo.Data.Host.PrivateShadowOffsetEnd = + LaunchInfo.Data.Host.PrivateShadowOffset + PrivateShadowMemorySize - 1; ContextInfo->Stats.UpdateShadowMalloced( @@ -792,11 +799,14 @@ ur_result_t 
SanitizerInterceptor::prepareLaunch( getContext()->logger.info( "ShadowMemory(Private, {} - {})", - (void *)LaunchInfo.Data->PrivateShadowOffset, - (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); + (void *)LaunchInfo.Data.Host.PrivateShadowOffset, + (void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd); } } } + + // Prepare launch info for device side + UR_CALL(LaunchInfo.Data.syncToDevice(Queue)); } while (false); return UR_RESULT_SUCCESS; @@ -848,61 +858,52 @@ ContextInfo::~ContextInfo() { } } -ur_result_t USMLaunchInfo::initialize() { - UR_CALL(getContext()->urDdiTable.Context.pfnRetain(Context)); - UR_CALL(getContext()->urDdiTable.Device.pfnRetain(Device)); - UR_CALL(getContext()->urDdiTable.USM.pfnSharedAlloc( - Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); - *Data = LaunchInfo{}; - return UR_RESULT_SUCCESS; -} - -ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { - auto NumArgs = KI.LocalArgs.size(); - if (NumArgs) { - Data->NumLocalArgs = NumArgs; - UR_CALL(getContext()->urDdiTable.USM.pfnSharedAlloc( - Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, - (void **)&Data->LocalArgs)); - uint32_t i = 0; +ur_result_t LaunchInfo::updateKernelInfo(const KernelInfo &KI) { + if (!KI.LocalArgs.empty()) { + std::vector LocalArgsInfo; for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { - Data->LocalArgs[i++] = ArgInfo; + LocalArgsInfo.push_back(ArgInfo); getContext()->logger.debug( "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, ArgInfo.Size, ArgInfo.SizeWithRedZone); } + ManagedQueue Queue(Context, Device); + UR_CALL( + Data.importLocalArgsInfo(Context, Device, Queue, LocalArgsInfo)); } return UR_RESULT_SUCCESS; } -USMLaunchInfo::~USMLaunchInfo() { +LaunchInfo::~LaunchInfo() { [[maybe_unused]] ur_result_t Result; - if (Data) { + if (Data.DevicePtr) { auto Type = GetDeviceType(Context, Device); auto ContextInfo = getContext()->interceptor->getContextInfo(Context); if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { - if (Data->PrivateShadowOffset) { + if (Data.Host.PrivateShadowOffset) { ContextInfo->Stats.UpdateShadowFreed( - Data->PrivateShadowOffsetEnd - Data->PrivateShadowOffset + - 1); + Data.Host.PrivateShadowOffsetEnd - + Data.Host.PrivateShadowOffset + 1); Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->PrivateShadowOffset); + Context, (void *)Data.Host.PrivateShadowOffset); assert(Result == UR_RESULT_SUCCESS); } - if (Data->LocalShadowOffset) { + if (Data.Host.LocalShadowOffset) { ContextInfo->Stats.UpdateShadowFreed( - Data->LocalShadowOffsetEnd - Data->LocalShadowOffset + 1); + Data.Host.LocalShadowOffsetEnd - + Data.Host.LocalShadowOffset + 1); Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->LocalShadowOffset); + Context, (void *)Data.Host.LocalShadowOffset); assert(Result == UR_RESULT_SUCCESS); } } - if (Data->LocalArgs) { + if (Data.Host.LocalArgs) { Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->LocalArgs); + Context, (void *)Data.Host.LocalArgs); assert(Result == UR_RESULT_SUCCESS); } - Result = getContext()->urDdiTable.USM.pfnFree(Context, (void *)Data); + Result = getContext()->urDdiTable.USM.pfnFree(Context, + (void *)Data.DevicePtr); assert(Result == UR_RESULT_SUCCESS); } Result = getContext()->urDdiTable.Context.pfnRelease(Context); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index e5429acd56..5d70f09cda 100644 --- 
a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -154,8 +154,50 @@ struct ContextInfo { } }; -struct USMLaunchInfo { - LaunchInfo *Data = nullptr; +struct AsanRuntimeDataWrapper { + AsanRuntimeData Host{}; + + AsanRuntimeData *DevicePtr = nullptr; + + ur_result_t syncFromDevice(ur_queue_handle_t Queue) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, ur_cast(&Host), DevicePtr, + sizeof(AsanRuntimeData), 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } + + ur_result_t syncToDevice(ur_queue_handle_t Queue) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, DevicePtr, ur_cast(&Host), + sizeof(AsanRuntimeData), 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } + + ur_result_t + importLocalArgsInfo(ur_context_handle_t Context, ur_device_handle_t Device, + ur_queue_handle_t Queue, + const std::vector &LocalArgs) { + assert(!LocalArgs.empty()); + + Host.NumLocalArgs = LocalArgs.size(); + const size_t LocalArgsInfoSize = + sizeof(LocalArgsInfo) * Host.NumLocalArgs; + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, LocalArgsInfoSize, + ur_cast(&Host.LocalArgs))); + + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Host.LocalArgs, &LocalArgs[0], LocalArgsInfoSize, 0, + nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } +}; + +struct LaunchInfo { + AsanRuntimeDataWrapper Data{}; ur_context_handle_t Context = nullptr; ur_device_handle_t Device = nullptr; @@ -164,19 +206,23 @@ struct USMLaunchInfo { std::vector LocalWorkSize; uint32_t WorkDim = 0; - USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - const size_t *GlobalWorkOffset, uint32_t WorkDim) + LaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } + [[maybe_unused]] auto Result = + getContext()->urDdiTable.Context.pfnRetain(Context); + assert(Result == UR_RESULT_SUCCESS); + Result = getContext()->urDdiTable.Device.pfnRetain(Device); + assert(Result == UR_RESULT_SUCCESS); } - ~USMLaunchInfo(); + ~LaunchInfo(); - ur_result_t initialize(); ur_result_t updateKernelInfo(const KernelInfo &KI); }; @@ -206,11 +252,11 @@ class SanitizerInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -285,7 +331,7 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); diff --git a/source/loader/layers/sanitizer/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan_libdevice.hpp index 8eba929f34..db2df0ff0f 100644 --- a/source/loader/layers/sanitizer/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan_libdevice.hpp @@ -71,10 +71,9 @@ 
struct LocalArgsInfo { constexpr std::size_t ASAN_MAX_NUM_REPORTS = 10; -struct LaunchInfo { +struct AsanRuntimeData { uintptr_t GlobalShadowOffset = 0; uintptr_t GlobalShadowOffsetEnd = 0; - uintptr_t PrivateShadowOffset = 0; uintptr_t PrivateShadowOffsetEnd = 0; diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 95b1649691..d65a51212b 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -458,10 +458,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( getContext()->logger.debug("==== urEnqueueKernelLaunch"); - USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), - pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, - workDim); - UR_CALL(LaunchInfo.initialize()); + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); UR_CALL(getContext()->interceptor->preLaunchKernel(hKernel, hQueue, LaunchInfo)); From aca9048e3fb4edea236caf2a7a01a05f16af1e99 Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Sun, 27 Oct 2024 20:34:22 -0700 Subject: [PATCH 002/148] format --- source/loader/layers/sanitizer/asan_interceptor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index e023208b59..fbbd1e2067 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -669,8 +669,10 @@ ur_result_t SanitizerInterceptor::prepareLaunch( return UR_RESULT_SUCCESS; } - LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; - LaunchInfo.Data.Host.GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; + LaunchInfo.Data.Host.GlobalShadowOffset = + DeviceInfo->Shadow->ShadowBegin; + LaunchInfo.Data.Host.GlobalShadowOffsetEnd = + DeviceInfo->Shadow->ShadowEnd; LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; LaunchInfo.Data.Host.Debug = getOptions().Debug ? 
1 : 0; From a61fe8e0d2ff5b1afc0854f8f52aa87ca04e5e10 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Mon, 28 Oct 2024 07:28:00 +0100 Subject: [PATCH 003/148] add option --- .../loader/layers/sanitizer/asan_interceptor.cpp | 16 +++++++++------- source/loader/layers/sanitizer/asan_options.cpp | 1 + source/loader/layers/sanitizer/asan_options.hpp | 1 + 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 4a315588fd..4219601895 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -837,13 +837,15 @@ ContextInfo::~ContextInfo() { getContext()->urDdiTable.Context.pfnRelease(Handle); assert(Result == UR_RESULT_SUCCESS); - // check memory leaks - std::vector AllocInfos = - getContext()->interceptor->findAllocInfoByContext(Handle); - for (const auto &It : AllocInfos) { - const auto &[_, AI] = *It; - if (!AI->IsReleased) { - ReportMemoryLeak(AI); + if (getOptions().DetectLeaks) { + // check memory leaks + std::vector AllocInfos = + getContext()->interceptor->findAllocInfoByContext(Handle); + for (const auto &It : AllocInfos) { + const auto &[_, AI] = *It; + if (!AI->IsReleased) { + ReportMemoryLeak(AI); + } } } } diff --git a/source/loader/layers/sanitizer/asan_options.cpp b/source/loader/layers/sanitizer/asan_options.cpp index 5c42ab8fca..e785cb37b7 100644 --- a/source/loader/layers/sanitizer/asan_options.cpp +++ b/source/loader/layers/sanitizer/asan_options.cpp @@ -85,6 +85,7 @@ AsanOptions::AsanOptions() { SetBoolOption("detect_locals", DetectLocals); SetBoolOption("detect_privates", DetectPrivates); SetBoolOption("print_stats", PrintStats); + SetBoolOption("detect_leaks", DetectLeaks); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { diff --git a/source/loader/layers/sanitizer/asan_options.hpp b/source/loader/layers/sanitizer/asan_options.hpp index 4c515e28fe..8a72a14cea 100644 --- a/source/loader/layers/sanitizer/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan_options.hpp @@ -25,6 +25,7 @@ struct AsanOptions { bool DetectPrivates = true; bool PrintStats = false; bool DetectKernelArguments = true; + bool DetectLeaks = true; explicit AsanOptions(); }; From cacc7696ef1045d35bab5c5e0d826debbcf981f3 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Mon, 28 Oct 2024 08:05:35 +0100 Subject: [PATCH 004/148] fix --- source/loader/layers/sanitizer/asan_interceptor.cpp | 2 +- source/loader/layers/sanitizer/asan_interceptor.hpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 4219601895..01e37aecdd 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -837,7 +837,7 @@ ContextInfo::~ContextInfo() { getContext()->urDdiTable.Context.pfnRelease(Handle); assert(Result == UR_RESULT_SUCCESS); - if (getOptions().DetectLeaks) { + if (getContext()->interceptor->getOptions().DetectLeaks) { // check memory leaks std::vector AllocInfos = getContext()->interceptor->findAllocInfoByContext(Handle); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index e5429acd56..90e0d385b1 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -291,6 
+291,7 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo); private: + AsanOptions m_Options; std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; @@ -316,8 +317,6 @@ class SanitizerInterceptor { std::unique_ptr m_Quarantine; - AsanOptions m_Options; - std::unordered_set m_Adapters; ur_shared_mutex m_AdaptersMutex; }; From 47e04d5f959a578a511b523f1aa1170e9b887ae1 Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Mon, 28 Oct 2024 01:09:27 -0700 Subject: [PATCH 005/148] Refine some code --- .../layers/sanitizer/asan_interceptor.cpp | 113 ++++++++---------- .../layers/sanitizer/asan_interceptor.hpp | 40 +++++-- 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index fbbd1e2067..73d88a4886 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -248,9 +248,6 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); - auto KernelInfo = getKernelInfo(Kernel); - - UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -663,12 +660,12 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalWorkSize[Dim]; } - // Set launch info argument auto ArgNums = GetKernelNumArgs(Kernel); if (ArgNums == 0) { return UR_RESULT_SUCCESS; } + // Prepare asan runtime data LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; LaunchInfo.Data.Host.GlobalShadowOffsetEnd = @@ -676,21 +673,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; LaunchInfo.Data.Host.Debug = getOptions().Debug ? 
1 : 0; - UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( - ContextInfo->Handle, DeviceInfo->Handle, nullptr, nullptr, - sizeof(LaunchInfo), (void **)&LaunchInfo.Data.DevicePtr)); - getContext()->logger.debug( - "launch_info {} (numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data.DevicePtr, - LaunchInfo.Data.Host.NumLocalArgs, - (void *)LaunchInfo.Data.Host.LocalArgs); - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.DevicePtr); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("Failed to set launch info: {}", URes); - return URes; - } - auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, Device = DeviceInfo->Handle, Queue](size_t Size, uptr &Ptr) { @@ -807,8 +789,34 @@ ur_result_t SanitizerInterceptor::prepareLaunch( } } - // Prepare launch info for device side + // Write local arguments info + if (!KernelInfo->LocalArgs.empty()) { + std::vector LocalArgsInfo; + for (auto [ArgIndex, ArgInfo] : KernelInfo->LocalArgs) { + LocalArgsInfo.push_back(ArgInfo); + getContext()->logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", + ArgIndex, ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + UR_CALL(LaunchInfo.Data.importLocalArgsInfo(Queue, LocalArgsInfo)); + } + + // sync asan runtime data to device side UR_CALL(LaunchInfo.Data.syncToDevice(Queue)); + + // set kernel argument + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to set launch info: {}", URes); + return URes; + } + + getContext()->logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data.getDevicePtr(), + LaunchInfo.Data.Host.NumLocalArgs, + (void *)LaunchInfo.Data.Host.LocalArgs); } while (false); return UR_RESULT_SUCCESS; @@ -860,54 +868,39 @@ ContextInfo::~ContextInfo() { } } -ur_result_t LaunchInfo::updateKernelInfo(const KernelInfo &KI) { - if (!KI.LocalArgs.empty()) { - std::vector LocalArgsInfo; - for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { - LocalArgsInfo.push_back(ArgInfo); - getContext()->logger.debug( - "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, - ArgInfo.Size, ArgInfo.SizeWithRedZone); - } - ManagedQueue Queue(Context, Device); - UR_CALL( - Data.importLocalArgsInfo(Context, Device, Queue, LocalArgsInfo)); - } - return UR_RESULT_SUCCESS; -} - -LaunchInfo::~LaunchInfo() { +AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; - if (Data.DevicePtr) { - auto Type = GetDeviceType(Context, Device); - auto ContextInfo = getContext()->interceptor->getContextInfo(Context); - if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { - if (Data.Host.PrivateShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data.Host.PrivateShadowOffsetEnd - - Data.Host.PrivateShadowOffset + 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data.Host.PrivateShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } - if (Data.Host.LocalShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data.Host.LocalShadowOffsetEnd - - Data.Host.LocalShadowOffset + 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data.Host.LocalShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } + auto Type = GetDeviceType(Context, Device); + auto ContextInfo = getContext()->interceptor->getContextInfo(Context); + if (Type == DeviceType::GPU_PVC || 
Type == DeviceType::GPU_DG2) { + if (Host.PrivateShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed(Host.PrivateShadowOffsetEnd - + Host.PrivateShadowOffset + 1); + Result = getContext()->urDdiTable.USM.pfnFree( + Context, (void *)Host.PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); } - if (Data.Host.LocalArgs) { + if (Host.LocalShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed(Host.LocalShadowOffsetEnd - + Host.LocalShadowOffset + 1); Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data.Host.LocalArgs); + Context, (void *)Host.LocalShadowOffset); assert(Result == UR_RESULT_SUCCESS); } + } + if (Host.LocalArgs) { Result = getContext()->urDdiTable.USM.pfnFree(Context, - (void *)Data.DevicePtr); + (void *)Host.LocalArgs); assert(Result == UR_RESULT_SUCCESS); } + if (DevicePtr) { + Result = getContext()->urDdiTable.USM.pfnFree(Context, DevicePtr); + assert(Result == UR_RESULT_SUCCESS); + } +} + +LaunchInfo::~LaunchInfo() { + [[maybe_unused]] ur_result_t Result; Result = getContext()->urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); Result = getContext()->urDdiTable.Device.pfnRelease(Device); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index 5d70f09cda..2f2c112eb7 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -159,9 +159,33 @@ struct AsanRuntimeDataWrapper { AsanRuntimeData *DevicePtr = nullptr; + ur_context_handle_t Context{}; + + ur_device_handle_t Device{}; + + AsanRuntimeDataWrapper(ur_context_handle_t Context, + ur_device_handle_t Device) + : Context(Context), Device(Device) {} + + ~AsanRuntimeDataWrapper(); + + AsanRuntimeData *getDevicePtr() { + if (DevicePtr == nullptr) { + ur_result_t Result = getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, sizeof(AsanRuntimeData), + (void **)&DevicePtr); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to alloc device usm for asan runtime data: {}", + Result); + } + } + return DevicePtr; + } + ur_result_t syncFromDevice(ur_queue_handle_t Queue) { UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, ur_cast(&Host), DevicePtr, + Queue, true, ur_cast(&Host), getDevicePtr(), sizeof(AsanRuntimeData), 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; @@ -169,15 +193,14 @@ struct AsanRuntimeDataWrapper { ur_result_t syncToDevice(ur_queue_handle_t Queue) { UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, DevicePtr, ur_cast(&Host), + Queue, true, getDevicePtr(), ur_cast(&Host), sizeof(AsanRuntimeData), 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } ur_result_t - importLocalArgsInfo(ur_context_handle_t Context, ur_device_handle_t Device, - ur_queue_handle_t Queue, + importLocalArgsInfo(ur_queue_handle_t Queue, const std::vector &LocalArgs) { assert(!LocalArgs.empty()); @@ -197,8 +220,6 @@ struct AsanRuntimeDataWrapper { }; struct LaunchInfo { - AsanRuntimeDataWrapper Data{}; - ur_context_handle_t Context = nullptr; ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; @@ -206,11 +227,14 @@ struct LaunchInfo { std::vector LocalWorkSize; uint32_t WorkDim = 0; + AsanRuntimeDataWrapper Data; + LaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, const size_t *GlobalWorkOffset, uint32_t WorkDim) : Context(Context), Device(Device), 
GlobalWorkSize(GlobalWorkSize), - GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { + GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim), + Data(Context, Device) { if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); @@ -222,8 +246,6 @@ struct LaunchInfo { assert(Result == UR_RESULT_SUCCESS); } ~LaunchInfo(); - - ur_result_t updateKernelInfo(const KernelInfo &KI); }; struct DeviceGlobalInfo { From 830992005aa8428aee6319708fb26ea0a4cbb758 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Mon, 28 Oct 2024 09:27:31 +0100 Subject: [PATCH 006/148] add comment --- source/loader/layers/sanitizer/asan_interceptor.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index 90e0d385b1..cc926ed545 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -291,6 +291,7 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo); private: + // m_Options may be used in other places, place it at the top AsanOptions m_Options; std::unordered_map> m_ContextMap; From 9564628a274164ef20ebd86b0806305dd333b289 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Tue, 29 Oct 2024 21:10:10 +0800 Subject: [PATCH 007/148] Fix failures on cpu device --- .../layers/sanitizer/asan_interceptor.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 73d88a4886..d16bbfca80 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -634,6 +634,20 @@ ur_result_t SanitizerInterceptor::prepareLaunch( } } + auto ArgNums = GetKernelNumArgs(Kernel); + // We must prepare all kernel args before call + // urKernelGetSuggestedLocalWorkSize, otherwise the call will fail on + // CPU device. 
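+    // Note: the launch info pointer is bound to the kernel's last
+    // argument slot (ArgNums - 1) via pfnSetArgPointer below.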
+ if (ArgNums) { + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to set launch info: {}", + URes); + return URes; + } + } + if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); auto URes = @@ -660,11 +674,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalWorkSize[Dim]; } - auto ArgNums = GetKernelNumArgs(Kernel); - if (ArgNums == 0) { - return UR_RESULT_SUCCESS; - } - // Prepare asan runtime data LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; @@ -804,14 +813,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( // sync asan runtime data to device side UR_CALL(LaunchInfo.Data.syncToDevice(Queue)); - // set kernel argument - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("Failed to set launch info: {}", URes); - return URes; - } - getContext()->logger.debug( "launch_info {} (numLocalArgs={}, localArgs={})", (void *)LaunchInfo.Data.getDevicePtr(), From dc588e483b4f44fee5467737e2b8bde3a9b4650c Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Tue, 29 Oct 2024 19:37:38 -0700 Subject: [PATCH 008/148] [DeviceASAN] Re-use shadow if required size is not larger than last one --- .../layers/sanitizer/asan_interceptor.cpp | 149 ++++-------------- .../loader/layers/sanitizer/asan_shadow.cpp | 106 +++++++++++-- .../loader/layers/sanitizer/asan_shadow.hpp | 32 ++++ 3 files changed, 163 insertions(+), 124 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 4a315588fd..fbcc401909 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -682,28 +682,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalWorkSize[Dim]; } - auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, - Device = DeviceInfo->Handle, - Queue](size_t Size, uptr &Ptr) { - void *Allocated = nullptr; - auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc( - Context, Device, nullptr, nullptr, Size, &Allocated); - if (URes != UR_RESULT_SUCCESS) { - return URes; - } - // Initialize shadow memory - URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size); - if (URes != UR_RESULT_SUCCESS) { - [[maybe_unused]] auto URes = - getContext()->urDdiTable.USM.pfnFree(Context, Allocated); - assert(URes == UR_RESULT_SUCCESS && - "urUSMFree failed at allocating shadow memory"); - Allocated = nullptr; - } - Ptr = (uptr)Allocated; - return URes; - }; - auto LocalMemoryUsage = GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); auto PrivateMemoryUsage = @@ -715,86 +693,45 @@ ur_result_t SanitizerInterceptor::prepareLaunch( // Write shadow memory offset for local memory if (getOptions().DetectLocals) { - // CPU needn't this - if (DeviceInfo->Type == DeviceType::GPU_PVC || - DeviceInfo->Type == DeviceType::GPU_DG2) { - const size_t LocalMemorySize = - GetDeviceLocalMemorySize(DeviceInfo->Handle); - const size_t LocalShadowMemorySize = - (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE; - - getContext()->logger.debug( - "LocalMemory(WorkGroup={}, LocalMemorySize={}, " - "LocalShadowMemorySize={})", - NumWG, LocalMemorySize, LocalShadowMemorySize); - - if (EnqueueAllocateShadowMemory( - LocalShadowMemorySize, 
- LaunchInfo.Data->LocalShadowOffset) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for local " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking local memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - LaunchInfo.Data->LocalShadowOffsetEnd = - LaunchInfo.Data->LocalShadowOffset + - LocalShadowMemorySize - 1; - - ContextInfo->Stats.UpdateShadowMalloced( - LocalShadowMemorySize); - - getContext()->logger.info( - "ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.Data->LocalShadowOffset, - (void *)LaunchInfo.Data->LocalShadowOffsetEnd); - } + if (DeviceInfo->Shadow->AllocLocalShadow( + Queue, NumWG, LaunchInfo.Data->LocalShadowOffset, + LaunchInfo.Data->LocalShadowOffsetEnd) != + UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for local " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + getContext()->logger.warning( + "Skip checking local memory of kernel <{}>", + GetKernelName(Kernel)); + } else { + getContext()->logger.info( + "ShadowMemory(Local, WorkGroup{}, {} - {})", NumWG, + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } // Write shadow memory offset for private memory if (getOptions().DetectPrivates) { - if (DeviceInfo->Type == DeviceType::CPU) { - LaunchInfo.Data->PrivateShadowOffset = - DeviceInfo->Shadow->ShadowBegin; - } else if (DeviceInfo->Type == DeviceType::GPU_PVC || - DeviceInfo->Type == DeviceType::GPU_DG2) { - const size_t PrivateShadowMemorySize = - (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE; - - getContext()->logger.debug("PrivateMemory(WorkGroup={}, " - "PrivateShadowMemorySize={})", - NumWG, PrivateShadowMemorySize); - - if (EnqueueAllocateShadowMemory( - PrivateShadowMemorySize, - LaunchInfo.Data->PrivateShadowOffset) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for private " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking private memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - LaunchInfo.Data->PrivateShadowOffsetEnd = - LaunchInfo.Data->PrivateShadowOffset + - PrivateShadowMemorySize - 1; - - ContextInfo->Stats.UpdateShadowMalloced( - PrivateShadowMemorySize); - - getContext()->logger.info( - "ShadowMemory(Private, {} - {})", - (void *)LaunchInfo.Data->PrivateShadowOffset, - (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); - } + if (DeviceInfo->Shadow->AllocPrivateShadow( + Queue, NumWG, LaunchInfo.Data->PrivateShadowOffset, + LaunchInfo.Data->PrivateShadowOffsetEnd) != + UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for private " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + getContext()->logger.warning( + "Skip checking private memory of kernel <{}>", + GetKernelName(Kernel)); + } else { + getContext()->logger.info( + "ShadowMemory(Private, WorkGroup{}, {} - {})", NumWG, + (void *)LaunchInfo.Data->PrivateShadowOffset, + (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); } } } while (false); @@ -878,25 +815,7 @@ ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; if (Data) { - auto Type = GetDeviceType(Context, Device); auto ContextInfo = getContext()->interceptor->getContextInfo(Context); - if (Type == 
DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { - if (Data->PrivateShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data->PrivateShadowOffsetEnd - Data->PrivateShadowOffset + - 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->PrivateShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } - if (Data->LocalShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data->LocalShadowOffsetEnd - Data->LocalShadowOffset + 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->LocalShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } - } if (Data->LocalArgs) { Result = getContext()->urDdiTable.USM.pfnFree( Context, (void *)Data->LocalArgs); diff --git a/source/loader/layers/sanitizer/asan_shadow.cpp b/source/loader/layers/sanitizer/asan_shadow.cpp index 629ce3a491..f5800a694c 100644 --- a/source/loader/layers/sanitizer/asan_shadow.cpp +++ b/source/loader/layers/sanitizer/asan_shadow.cpp @@ -131,16 +131,23 @@ ur_result_t ShadowMemoryGPU::Setup() { } ur_result_t ShadowMemoryGPU::Destory() { - if (ShadowBegin == 0) { - return UR_RESULT_SUCCESS; + if (PrivateShadowOffset != 0) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)PrivateShadowOffset)); + PrivateShadowOffset = 0; } - static ur_result_t Result = [this]() { - auto Result = getContext()->urDdiTable.VirtualMem.pfnFree( - Context, (const void *)ShadowBegin, GetShadowSize()); - getContext()->urDdiTable.Context.pfnRelease(Context); - return Result; - }(); - return Result; + if (LocalShadowOffset != 0) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)LocalShadowOffset)); + LocalShadowOffset = 0; + } + if (ShadowBegin != 0) { + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree( + Context, (const void *)ShadowBegin, GetShadowSize())); + UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context)); + ShadowBegin = ShadowEnd = 0; + } + return UR_RESULT_SUCCESS; } ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, @@ -255,6 +262,87 @@ ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { return UR_RESULT_SUCCESS; } +ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue, + uint32_t NumWG, uptr &Begin, + uptr &End) { + const size_t LocalMemorySize = GetDeviceLocalMemorySize(Device); + const size_t RequiredShadowSize = + (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE; + static size_t LastAllocedSize = 0; + if (RequiredShadowSize > LastAllocedSize) { + auto ContextInfo = getContext()->interceptor->getContextInfo(Context); + if (LocalShadowOffset) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)LocalShadowOffset)); + ContextInfo->Stats.UpdateShadowFreed(LastAllocedSize); + LocalShadowOffset = 0; + LastAllocedSize = 0; + } + + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, RequiredShadowSize, + (void **)&LocalShadowOffset)); + + // Initialize shadow memory + ur_result_t URes = EnqueueUSMBlockingSet( + Queue, (void *)LocalShadowOffset, 0, RequiredShadowSize); + if (URes != UR_RESULT_SUCCESS) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)LocalShadowOffset)); + LocalShadowOffset = 0; + LastAllocedSize = 0; + } + + ContextInfo->Stats.UpdateShadowMalloced(RequiredShadowSize); + + LastAllocedSize = RequiredShadowSize; + } + + Begin = LocalShadowOffset; + End = LocalShadowOffset + RequiredShadowSize - 1; + return UR_RESULT_SUCCESS; +} + +ur_result_t ShadowMemoryGPU::AllocPrivateShadow(ur_queue_handle_t Queue, + 
uint32_t NumWG, uptr &Begin, + uptr &End) { + const size_t RequiredShadowSize = + (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE; + static size_t LastAllocedSize = 0; + if (RequiredShadowSize > LastAllocedSize) { + auto ContextInfo = getContext()->interceptor->getContextInfo(Context); + if (PrivateShadowOffset) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)PrivateShadowOffset)); + ContextInfo->Stats.UpdateShadowFreed(LastAllocedSize); + PrivateShadowOffset = 0; + LastAllocedSize = 0; + } + + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, RequiredShadowSize, + (void **)&PrivateShadowOffset)); + + // Initialize shadow memory + ur_result_t URes = EnqueueUSMBlockingSet( + Queue, (void *)PrivateShadowOffset, 0, RequiredShadowSize); + if (URes != UR_RESULT_SUCCESS) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree( + Context, (void *)PrivateShadowOffset)); + PrivateShadowOffset = 0; + LastAllocedSize = 0; + } + + ContextInfo->Stats.UpdateShadowMalloced(RequiredShadowSize); + + LastAllocedSize = RequiredShadowSize; + } + + Begin = PrivateShadowOffset; + End = PrivateShadowOffset + RequiredShadowSize - 1; + return UR_RESULT_SUCCESS; +} + uptr ShadowMemoryPVC::MemToShadow(uptr Ptr) { if (Ptr & 0xFF00000000000000ULL) { // Device USM return ShadowBegin + 0x80000000000ULL + diff --git a/source/loader/layers/sanitizer/asan_shadow.hpp b/source/loader/layers/sanitizer/asan_shadow.hpp index 7ae095062a..d6d6e634e6 100644 --- a/source/loader/layers/sanitizer/asan_shadow.hpp +++ b/source/loader/layers/sanitizer/asan_shadow.hpp @@ -39,6 +39,14 @@ struct ShadowMemory { virtual size_t GetShadowSize() = 0; + virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, + uint32_t NumWG, uptr &Begin, + uptr &End) = 0; + + virtual ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue, + uint32_t NumWG, uptr &Begin, + uptr &End) = 0; + ur_context_handle_t Context{}; ur_device_handle_t Device{}; @@ -62,6 +70,20 @@ struct ShadowMemoryCPU final : public ShadowMemory { uptr Size, u8 Value) override; size_t GetShadowSize() override { return 0x80000000000ULL; } + + ur_result_t AllocLocalShadow(ur_queue_handle_t, uint32_t, uptr &Begin, + uptr &End) override { + Begin = ShadowBegin; + End = ShadowEnd; + return UR_RESULT_SUCCESS; + } + + ur_result_t AllocPrivateShadow(ur_queue_handle_t, uint32_t, uptr &Begin, + uptr &End) override { + Begin = ShadowBegin; + End = ShadowEnd; + return UR_RESULT_SUCCESS; + } }; struct ShadowMemoryGPU : public ShadowMemory { @@ -76,12 +98,22 @@ struct ShadowMemoryGPU : public ShadowMemory { ur_result_t ReleaseShadow(std::shared_ptr AI) override final; + ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG, + uptr &Begin, uptr &End) override final; + + ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue, uint32_t NumWG, + uptr &Begin, uptr &End) override final; + ur_mutex VirtualMemMapsMutex; std::unordered_map< uptr, std::pair>>> VirtualMemMaps; + + uptr LocalShadowOffset = 0; + + uptr PrivateShadowOffset = 0; }; /// Shadow Memory layout of GPU PVC device From d263f9d0e214c85e97afdba9aa52db5bbf6723ad Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 6 Nov 2024 00:07:16 +0100 Subject: [PATCH 009/148] [Common] fix parseDisjointPoolConfig returning {} when EnableBuffers is false means returning a defult-constructed DisjointPoolAllConfigs struct which has EnableBuffers set to 1. Fix this by always returning properly constructed DisjointPoolAllConfigs. 
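
To illustrate, a minimal sketch of the pitfall with hypothetical names
(the real struct and parser live in disjoint_pool_config_parser.cpp):

    struct Configs {
        int EnableBuffers = 1; // in-class default, as in DisjointPoolAllConfigs
        // other members elided
    };

    Configs parse(int enableBuffers) {
        Configs all;
        all.EnableBuffers = enableBuffers;
        if (!enableBuffers) {
            return {}; // bug: value-initializes a fresh Configs, EnableBuffers == 1
        }
        return all;
    }

Returning the configured struct unconditionally, with EnableBuffers
assigned up front, preserves the caller's setting.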
--- source/common/umf_pools/disjoint_pool_config_parser.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp index f1bb7cd40c..0e82072ae2 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.cpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp @@ -215,6 +215,8 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, } } + AllConfigs.EnableBuffers = EnableBuffers; + AllConfigs.limits = std::shared_ptr( umfDisjointPoolSharedLimitsCreate(MaxSize), umfDisjointPoolSharedLimitsDestroy); @@ -224,10 +226,6 @@ DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, Config.PoolTrace = trace; } - if (!EnableBuffers) { - return {}; - } - if (!trace) { return AllConfigs; } From 69966c77a55011961e248ffc4ffa08556083b79d Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Wed, 6 Nov 2024 21:35:06 -0800 Subject: [PATCH 010/148] minor fixes --- .../layers/sanitizer/asan_interceptor.cpp | 21 +++++++++++-------- .../loader/layers/sanitizer/asan_shadow.cpp | 1 + source/loader/layers/sanitizer/ur_sanddi.cpp | 2 ++ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 4a315588fd..5523a6d2a8 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -215,25 +215,28 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, if (ReleaseList.size()) { std::scoped_lock Guard(m_AllocationMapMutex); for (auto &It : ReleaseList) { + auto ToFreeAllocInfo = It->second; getContext()->logger.info("Quarantine Free: {}", - (void *)It->second->AllocBegin); + (void *)ToFreeAllocInfo->AllocBegin); - ContextInfo->Stats.UpdateUSMRealFreed(AllocInfo->AllocSize, - AllocInfo->getRedzoneSize()); + ContextInfo->Stats.UpdateUSMRealFreed( + ToFreeAllocInfo->AllocSize, ToFreeAllocInfo->getRedzoneSize()); - m_AllocationMap.erase(It); - if (AllocInfo->Type == AllocType::HOST_USM) { + if (ToFreeAllocInfo->Type == AllocType::HOST_USM) { for (auto &Device : ContextInfo->DeviceList) { UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow( - AllocInfo)); + ToFreeAllocInfo)); } } else { - UR_CALL(getDeviceInfo(AllocInfo->Device) - ->Shadow->ReleaseShadow(AllocInfo)); + UR_CALL(getDeviceInfo(ToFreeAllocInfo->Device) + ->Shadow->ReleaseShadow(ToFreeAllocInfo)); } UR_CALL(getContext()->urDdiTable.USM.pfnFree( - Context, (void *)(It->second->AllocBegin))); + Context, (void *)(ToFreeAllocInfo->AllocBegin))); + + // Erase it at last to avoid use-after-free. 
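+            // (ToFreeAllocInfo above holds its own reference, so the
+            // record stays valid even while the map entry is removed.)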
+ m_AllocationMap.erase(It); } } ContextInfo->Stats.UpdateUSMFreed(AllocInfo->AllocSize); diff --git a/source/loader/layers/sanitizer/asan_shadow.cpp b/source/loader/layers/sanitizer/asan_shadow.cpp index 629ce3a491..e0be521ffd 100644 --- a/source/loader/layers/sanitizer/asan_shadow.cpp +++ b/source/loader/layers/sanitizer/asan_shadow.cpp @@ -249,6 +249,7 @@ ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}", (void *)MappedPtr, (void *)(MappedPtr + PageSize - 1)); + VirtualMemMaps.erase(MappedPtr); } } diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 95b1649691..f065a116e4 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -357,6 +357,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( UR_CALL(pfnProgramLink(hContext, count, phPrograms, pOptions, phProgram)); + UR_CALL(getContext()->interceptor->insertProgram(*phProgram)); UR_CALL(getContext()->interceptor->registerProgram(hContext, *phProgram)); return UR_RESULT_SUCCESS; @@ -388,6 +389,7 @@ ur_result_t UR_APICALL urProgramLinkExp( UR_CALL(pfnProgramLinkExp(hContext, numDevices, phDevices, count, phPrograms, pOptions, phProgram)); + UR_CALL(getContext()->interceptor->insertProgram(*phProgram)); UR_CALL(getContext()->interceptor->registerProgram(hContext, *phProgram)); return UR_RESULT_SUCCESS; From 1a10dab39cb4f31af904038cfb30cc24e8b157af Mon Sep 17 00:00:00 2001 From: Michael Aziz Date: Mon, 11 Nov 2024 11:21:06 -0800 Subject: [PATCH 011/148] Update `SuggestMaxCooperativeGroupCountExp` Signed-off-by: Michael Aziz --- include/ur_api.h | 11 ++++++++--- include/ur_ddi.h | 3 ++- include/ur_print.hpp | 10 ++++++++-- scripts/core/exp-cooperative-kernels.yml | 10 +++++++--- source/adapters/mock/ur_mockddi.cpp | 13 +++++++++---- source/adapters/opencl/kernel.cpp | 3 ++- source/loader/layers/tracing/ur_trcddi.cpp | 16 +++++++++++----- source/loader/layers/validation/ur_valddi.cpp | 17 +++++++++++++---- source/loader/ur_ldrddi.cpp | 13 +++++++++---- source/loader/ur_libapi.cpp | 14 ++++++++++---- source/ur_api.cpp | 11 ++++++++--- 11 files changed, 87 insertions(+), 34 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index db1c47b2d5..fd0f963ec4 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9435,13 +9435,17 @@ urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups @@ -10687,7 +10691,8 @@ typedef struct ur_kernel_set_specialization_constants_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { ur_kernel_handle_t *phKernel; - size_t *plocalWorkSize; + uint32_t *pworkDim; + const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; uint32_t **ppGroupCountRet; } ur_kernel_suggest_max_cooperative_group_count_exp_params_t; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 80a0003fca..03f5ef6282 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -651,7 +651,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( ur_kernel_handle_t, - size_t, + uint32_t, + const size_t *, size_t, uint32_t *); diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 0439a12642..debc3a945c 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -12215,9 +12215,15 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phKernel)); os << ", "; - os << ".localWorkSize = "; + os << ".workDim = "; + + os << *(params->pworkDim); + + os << ", "; + os << ".pLocalWorkSize = "; - os << *(params->plocalWorkSize); + ur::details::printPtr(os, + *(params->ppLocalWorkSize)); os << ", "; os << ".dynamicSharedMemorySize = "; diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index 941aba29fa..ad3ba0ffba 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -78,9 +78,13 @@ params: - type: $x_kernel_handle_t name: hKernel desc: "[in] handle of the kernel object" - - type: size_t - name: localWorkSize - desc: "[in] number of local work-items that will form a work-group when the kernel is launched" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" + - type: "const size_t*" + name: pLocalWorkSize + desc: | + [in] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. 
- type: size_t name: dynamicSharedMemorySize desc: "[in] size of dynamic shared memory, for each work-group, in bytes, that will be used when the kernel is launched" diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index dea28a4658..3e5f25b437 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10003,9 +10003,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -10014,7 +10018,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_result_t result = UR_RESULT_SUCCESS; ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &localWorkSize, &dynamicSharedMemorySize, &pGroupCountRet}; + &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &pGroupCountRet}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 617b6a9b2c..c36f7e9926 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -361,7 +361,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( [[maybe_unused]] ur_kernel_handle_t hKernel, - [[maybe_unused]] size_t localWorkSize, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pLocalWorkSize, [[maybe_unused]] size_t dynamicSharedMemorySize, [[maybe_unused]] uint32_t *pGroupCountRet) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 9cc18c66c4..f8576bbd12 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8585,9 +8585,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8602,7 +8606,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &localWorkSize, &dynamicSharedMemorySize, &pGroupCountRet}; + &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &pGroupCountRet}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms); @@ -8611,7 +8616,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); getContext()->notify_end( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index ef7bb019ea..a2e48405d9 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9609,9 +9609,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -9630,6 +9634,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == pLocalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + if (NULL == pGroupCountRet) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9641,7 +9649,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index a67879a9eb..775c4868f4 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8760,9 +8760,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8785,7 +8789,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( // forward to device-platform result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 574e81103c..34e91ac95f 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8884,13 +8884,18 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8904,7 +8909,8 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 79aadc6090..9ab95cee98 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7534,13 +7534,18 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched From b5cefa48d38274704611e32003c2decd00c59d10 Mon Sep 17 00:00:00 2001 From: Michael Aziz Date: Mon, 11 Nov 2024 11:28:09 -0800 Subject: [PATCH 012/148] Update query in adapters Signed-off-by: Michael Aziz --- source/adapters/cuda/kernel.cpp | 6 +++++- source/adapters/hip/kernel.cpp | 5 +++-- source/adapters/level_zero/kernel.cpp | 10 ++++++++-- source/adapters/level_zero/ur_interface_loader.hpp | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 5fb097c304..46c4907d4b 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -190,10 +190,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + size_t localWorkSize = pLocalWorkSize[0]; + localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1); + localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1); + // We need to set the active current device for this kernel explicitly here, // because the occupancy querying API does not take device parameter. 
ur_device_handle_t Device = hKernel->getProgram()->getDevice(); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index 60931cd014..176a2a495a 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -169,10 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { std::ignore = hKernel; - std::ignore = localWorkSize; + std::ignore = workDim; + std::ignore = pLocalWorkSize; std::ignore = dynamicSharedMemorySize; std::ignore = pGroupCountRet; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index da9152396a..cc542de620 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -1051,11 +1051,17 @@ ur_result_t urKernelGetNativeHandle( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)localWorkSize; (void)dynamicSharedMemorySize; std::shared_lock Guard(hKernel->Mutex); + + uint32_t WG[3]; + WG[0] = ur_cast(pLocalWorkSize[0]); + WG[1] = workDim >= 2 ? ur_cast(pLocalWorkSize[1]) : 1; + WG[2] = workDim == 3 ? ur_cast(pLocalWorkSize[2]) : 1; + ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2])); + uint32_t TotalGroupCount = 0; ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, (hKernel->ZeKernel, &TotalGroupCount)); diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 1207f7776b..5b0620b1d1 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -687,7 +687,7 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, From f99adf104ef5ce94d66b6ed4f9dccc450f1f638c Mon Sep 17 00:00:00 2001 From: Michael Aziz Date: Tue, 12 Nov 2024 12:42:35 -0800 Subject: [PATCH 013/148] Update query API in L0 V2 adapter Signed-off-by: Michael Aziz --- source/adapters/level_zero/v2/api.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index e4a70df811..593115a99f 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -568,7 +568,7 @@ ur_result_t urCommandBufferCommandGetInfoExp( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { logger::error("{} function not implemented!", 
                __FUNCTION__);
  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;

From 79927e2cb273ad65737b4b8df5296fc1f42542bc Mon Sep 17 00:00:00 2001
From: Nicolas Miller
Date: Wed, 20 Nov 2024 11:04:21 +0000
Subject: [PATCH 014/148] [HIP] Implement num regs kernel query

---
 source/adapters/hip/kernel.cpp                   | 6 ++++++
 test/conformance/kernel/kernel_adapter_hip.match | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index 60931cd014..e02616104a 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -225,6 +225,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
     return ReturnValue(hKernel->getProgram());
   case UR_KERNEL_INFO_ATTRIBUTES:
     return ReturnValue("");
+  case UR_KERNEL_INFO_NUM_REGS: {
+    int NumRegs = 0;
+    UR_CHECK_ERROR(hipFuncGetAttribute(&NumRegs, HIP_FUNC_ATTRIBUTE_NUM_REGS,
+                                       hKernel->get()));
+    return ReturnValue(static_cast<uint32_t>(NumRegs));
+  }
   default:
     break;
   }
diff --git a/test/conformance/kernel/kernel_adapter_hip.match b/test/conformance/kernel/kernel_adapter_hip.match
index f8ea9e3e99..e940db8331 100644
--- a/test/conformance/kernel/kernel_adapter_hip.match
+++ b/test/conformance/kernel/kernel_adapter_hip.match
@@ -1,5 +1,4 @@
 urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/*
-urKernelGetInfoTest.Success/*__UR_KERNEL_INFO_NUM_REGS
 urKernelSetArgLocalTest.InvalidKernelArgumentIndex/*
 urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/*
 urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/*

From dd4ae76f348bad40f56969a82bef606b2418f74b Mon Sep 17 00:00:00 2001
From: Nicolas Miller
Date: Wed, 20 Nov 2024 11:30:31 +0000
Subject: [PATCH 015/148] [HIP] Disable SYCL images by default

Align the HIP adapter with the CUDA adapter and report SYCL images as
disabled by default; they're implemented at about the same level as the
ones in CUDA, so there's no reason to treat them differently on HIP.

Also, any future work to improve the HIP image support should focus on
bindless images rather than this SYCL image support, so it makes sense
to consider this essentially "deprecated".
---
 source/adapters/cuda/device.cpp |  2 +-
 source/adapters/hip/device.cpp  | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index cb6b757dd3..3f7fdc8c46 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -292,7 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
       logger::always(
           "Images are not fully supported by the CUDA BE, their support is "
           "disabled by default. 
Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " + "setting UR_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " "runtime."); } diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 5271f73709..3922843ed1 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -12,6 +12,7 @@ #include "adapter.hpp" #include "context.hpp" #include "event.hpp" +#include "logger/ur_logger.hpp" #include @@ -223,7 +224,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(uint64_t{MaxAlloc}); } case UR_DEVICE_INFO_IMAGE_SUPPORTED: { - return ReturnValue(ur_bool_t{true}); + bool Enabled = false; + + if (std::getenv("UR_HIP_ENABLE_IMAGE_SUPPORT") != nullptr) { + Enabled = true; + } else { + logger::always( + "Images are not fully supported by the HIP BE, their support is " + "disabled by default. Their partial support can be activated by " + "setting UR_HIP_ENABLE_IMAGE_SUPPORT environment variable at " + "runtime."); + } + + return ReturnValue(Enabled); } case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { // This call doesn't match to HIP as it doesn't have images, but instead From cf524596eafb3ccee9b21ea916446b67d8b61222 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 20 Nov 2024 11:47:15 +0000 Subject: [PATCH 016/148] [HIP] Fix error code for unsupported program info Unsupported program info should return unsupported enumeration instead of unsupported feature. This aligns with the CUDA adapter and makes the CTS understand that these properties aren't failing but just unsupported. --- source/adapters/hip/program.cpp | 5 ++++- test/conformance/program/program_adapter_hip.match | 3 --- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index 4c4f2b2766..d7b7be10fe 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -408,7 +408,10 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: - return getKernelNames(hProgram); + /* TODO: Add implementation for getKernelNames */ + UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_PROGRAM_INFO_NUM_KERNELS: case UR_PROGRAM_INFO_IL: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: diff --git a/test/conformance/program/program_adapter_hip.match b/test/conformance/program/program_adapter_hip.match index 69fe6ac1bb..2bf556541d 100644 --- a/test/conformance/program/program_adapter_hip.match +++ b/test/conformance/program/program_adapter_hip.match @@ -1,9 +1,6 @@ urProgramBuildTest.BuildFailure/* # HIP hasn't implemented urProgramCreateWithNativeHandleTest {{OPT}}urProgramCreateWithNativeHandleTest.Success/* -# HIP doesn't expose kernel numbers or names -urProgramGetInfoTest.Success/*__UR_PROGRAM_INFO_NUM_KERNELS -urProgramGetInfoTest.Success/*__UR_PROGRAM_INFO_KERNEL_NAMES # HIP hasn't implemented urProgramLink {{OPT}}urProgramLinkTest.Success/* From 1b373f83c71eb39d440ca2c1ed82faf3bfa9b720 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 20 Nov 2024 11:55:42 +0000 Subject: [PATCH 017/148] [HIP] Update match files for disabled images --- test/conformance/memory/memory_adapter_hip.match | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/test/conformance/memory/memory_adapter_hip.match b/test/conformance/memory/memory_adapter_hip.match index a4181fcc8a..2acbdfb2e8 100644 --- a/test/conformance/memory/memory_adapter_hip.match +++ b/test/conformance/memory/memory_adapter_hip.match @@ -1,6 +1,4 @@ urMemImageCreateTest.InvalidSize/* -urMemImageGetInfoTest.Success/* urMemBufferCreateWithNativeHandleTest.Success/* urMemBufferCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle/* urMemBufferCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/* -urMemImageCreateWithNativeHandleTest.Success/* From 8644396213b31979c7744fbf26deb105afed5ed0 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 20 Nov 2024 14:10:17 +0000 Subject: [PATCH 018/148] [CUDA][HIP] Cleanup KERNEL_NAMES property No need to keep an empty function, it can always be added if we need it in the future. --- source/adapters/cuda/program.cpp | 16 +++------------- source/adapters/hip/program.cpp | 14 +++----------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 4b963a737a..dfd5e9e6b8 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -176,17 +176,6 @@ ur_result_t ur_program_handle_t_::getGlobalVariablePointer( return UR_RESULT_SUCCESS; } -/// Finds kernel names by searching for entry points in the PTX source, as the -/// CUDA driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to UR and use cuModuleGetFunction to check for a kernel. -/// Note: Another alternative is to add kernel names as metadata, like with -/// reqd_work_group_size. -ur_result_t getKernelNames(ur_program_handle_t) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - /// Loads images from a list of PTX or CUBIN binaries. /// Note: No calls to CUDA driver API in this function, only store binaries /// for later. @@ -421,8 +410,9 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: - /* TODO: Add implementation for getKernelNames */ - UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + // CUDA has no way to query a list of kernels from a binary. + // In SYCL this is only used in kernel bundle when building from source + // which isn't currently supported for CUDA. return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_PROGRAM_INFO_NUM_KERNELS: case UR_PROGRAM_INFO_IL: diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index d7b7be10fe..eae3fda366 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -259,15 +259,6 @@ ur_result_t ur_program_handle_t_::getGlobalVariablePointer( return UR_RESULT_SUCCESS; } -/// Finds kernel names by searching for entry points in the PTX source, as the -/// HIP driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to UR and use hipModuleGetFunction to check for a kernel. 
-ur_result_t getKernelNames(ur_program_handle_t) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - /// A program must be specific to a device so this entry point is UNSUPPORTED UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, @@ -408,8 +399,9 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: - /* TODO: Add implementation for getKernelNames */ - UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + // HIP has no way to query a list of kernels from a binary. + // In SYCL this is only used in kernel bundle when building from source + // which isn't currently supported for HIP. return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_PROGRAM_INFO_NUM_KERNELS: case UR_PROGRAM_INFO_IL: From 31210f0431ca397d879395bad00292490f7f7f95 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Mon, 25 Nov 2024 22:42:54 +0000 Subject: [PATCH 019/148] [Benchmarks] Add MemcpyExecute scenario with high ops count to measure API overhead --- scripts/benchmarks/benches/compute.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 57bed7624a..8916f52f53 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -59,14 +59,16 @@ def benchmarks(self) -> list[Benchmark]: ExecImmediateCopyQueue(self, 0, 1, 'Device', 'Device', 1024), ExecImmediateCopyQueue(self, 1, 1, 'Device', 'Host', 1024), VectorSum(self), - MemcpyExecute(self, 400, 1, 102400, 10, 1, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 1, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1), - MemcpyExecute(self, 400, 1, 102400, 10, 0, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 0, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1), + MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0), + MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), ] if options.ur is not None: @@ -267,22 +269,23 @@ def bin_args(self) -> list[str]: ] class MemcpyExecute(ComputeBenchmark): - def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM): + def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM, useEvent): self.numOpsPerThread = numOpsPerThread self.numThreads = numThreads self.allocSize = allocSize self.iterations = iterations self.srcUSM = srcUSM self.dstUSM = dstUSM + self.useEvents = useEvent super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute") def name(self): - return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} 
dstUSM:{self.dstUSM}" + (" without events" if not self.useEvents else "") def bin_args(self) -> list[str]: return [ "--Ioq=1", - "--UseEvents=1", + f"--UseEvents={self.useEvents}", "--MeasureCompletion=1", "--UseQueuePerThread=1", f"--AllocSize={self.allocSize}", From caa7e30b22d4fe0907ce742f626977fef6298b76 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 19 Nov 2024 15:04:33 +0100 Subject: [PATCH 020/148] [L0 v2] fix urMemGetNativeHandle When called twice on the same handle, urMemGetNativeHandle caused a null dereference on an unset device pointer. --- source/adapters/level_zero/v2/memory.cpp | 9 ++++---- .../memory/memory_adapter_level_zero_v2.match | 5 ----- .../memory/urMemGetNativeHandle.cpp | 22 +++++++++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/source/adapters/level_zero/v2/memory.cpp b/source/adapters/level_zero/v2/memory.cpp index 65972f3aff..3e885d5f6c 100644 --- a/source/adapters/level_zero/v2/memory.cpp +++ b/source/adapters/level_zero/v2/memory.cpp @@ -260,7 +260,6 @@ void *ur_discrete_mem_handle_t::getDevicePtr( std::ignore = access; std::ignore = size; std::ignore = migrate; - if (!activeAllocationDevice) { if (!hDevice) { hDevice = hContext->getDevices()[0]; @@ -269,6 +268,10 @@ void *ur_discrete_mem_handle_t::getDevicePtr( allocateOnDevice(hDevice, getSize()); } + if (!hDevice) { + hDevice = activeAllocationDevice; + } + char *ptr; if (activeAllocationDevice == hDevice) { ptr = ur_cast(deviceAllocations[hDevice->Id.value()].get()); @@ -559,12 +562,10 @@ ur_result_t urMemRelease(ur_mem_handle_t hMem) try { ur_result_t urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t hDevice, ur_native_handle_t *phNativeMem) try { - std::ignore = hDevice; - std::scoped_lock lock(hMem->getMutex()); auto ptr = hMem->getDevicePtr( - nullptr, ur_mem_handle_t_::device_access_mode_t::read_write, 0, + hDevice, ur_mem_handle_t_::device_access_mode_t::read_write, 0, hMem->getSize(), nullptr); *phNativeMem = reinterpret_cast(ptr); return UR_RESULT_SUCCESS; diff --git a/test/conformance/memory/memory_adapter_level_zero_v2.match b/test/conformance/memory/memory_adapter_level_zero_v2.match index d2f34a947d..ec09c7b5ef 100644 --- a/test/conformance/memory/memory_adapter_level_zero_v2.match +++ b/test/conformance/memory/memory_adapter_level_zero_v2.match @@ -10,11 +10,6 @@ {{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/* {{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/* {{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/* -{{OPT}}urMemBufferCreateWithNativeHandleTest.Success/* -{{OPT}}urMemBufferCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle/* -{{OPT}}urMemBufferCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/* -{{OPT}}urMemBufferCreateWithNativeHandleTest.InvalidNullHandle/* -{{OPT}}urMemBufferCreateWithNativeHandleTest.InvalidNullPointer/* {{OPT}}urMemImageCreateWithNativeHandleTest.Success/* {{OPT}}urMemImageCreateWithNativeHandleTest.InvalidNullHandle/* {{OPT}}urMemImageCreateWithNativeHandleTest.InvalidNullPointer/* diff --git a/test/conformance/memory/urMemGetNativeHandle.cpp b/test/conformance/memory/urMemGetNativeHandle.cpp index 76b8454ab5..4990d90929 100644 --- a/test/conformance/memory/urMemGetNativeHandle.cpp +++ b/test/conformance/memory/urMemGetNativeHandle.cpp @@ -2,6 +2,8 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
// See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "ur_api.h" +#include #include using urMemGetNativeHandleTest = uur::urMemBufferTest; @@ -14,6 +16,26 @@ TEST_P(urMemGetNativeHandleTest, Success) { } } +TEST_P(urMemGetNativeHandleTest, SuccessNullDeviceTwice) { + ur_native_handle_t hNativeMem = 0; + if (auto error = urMemGetNativeHandle(buffer, nullptr, &hNativeMem)) { + ASSERT_TRUE(error == UR_RESULT_ERROR_UNSUPPORTED_FEATURE || + error == UR_RESULT_ERROR_INVALID_NULL_HANDLE); + } + if (auto error = urMemGetNativeHandle(buffer, nullptr, &hNativeMem)) { + ASSERT_TRUE(error == UR_RESULT_ERROR_UNSUPPORTED_FEATURE || + error == UR_RESULT_ERROR_INVALID_NULL_HANDLE); + } +} + +TEST_P(urMemGetNativeHandleTest, SuccessNullDevice) { + ur_native_handle_t hNativeMem = 0; + if (auto error = urMemGetNativeHandle(buffer, nullptr, &hNativeMem)) { + ASSERT_TRUE(error == UR_RESULT_ERROR_UNSUPPORTED_FEATURE || + error == UR_RESULT_ERROR_INVALID_NULL_HANDLE); + } +} + TEST_P(urMemGetNativeHandleTest, InvalidNullHandleMem) { ur_native_handle_t phNativeMem; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, From eb2c3ce5f93b8d91fe028a744b524ec18192d9ae Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 26 Nov 2024 15:41:53 +0100 Subject: [PATCH 021/148] [L0] fix use-after-free in urProgramLinkExp Found by memcheck. The BuildFlagPtr vector was being populated by the backing data of temporarily created std::string. --- source/adapters/level_zero/program.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 6ca6e94b25..1b66d12a30 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -452,9 +452,11 @@ ur_result_t urProgramLinkExp( // Build flags may be different for different devices, so handle them // here. Clear values of the previous device first. BuildFlagPtrs.clear(); + std::vector TemporaryOptionsStrings; for (uint32_t I = 0; I < count; I++) { - BuildFlagPtrs.push_back( - phPrograms[I]->getBuildOptions(ZeDevice).c_str()); + TemporaryOptionsStrings.push_back( + phPrograms[I]->getBuildOptions(ZeDevice)); + BuildFlagPtrs.push_back(TemporaryOptionsStrings.back().c_str()); } ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); if (count == 1) From 3d2b7210e35227475eae36f248c21551f3e2fc41 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 19 Nov 2024 16:56:44 +0000 Subject: [PATCH 022/148] Fix a number of issues from the latest coverity scan. 
--- source/adapters/cuda/common.cpp | 8 +++--- source/adapters/hip/common.cpp | 4 ++- source/adapters/level_zero/common.cpp | 8 +++--- source/adapters/native_cpu/common.cpp | 8 +++--- source/adapters/opencl/common.cpp | 9 ++++--- .../layers/sanitizer/asan/asan_report.cpp | 7 +++--- .../sanitizer_common/sanitizer_stacktrace.cpp | 2 +- test/conformance/source/environment.cpp | 4 +-- test/conformance/usm/urUSMFree.cpp | 18 ++++++------- test/conformance/usm/urUSMSharedAlloc.cpp | 25 +++++++------------ 10 files changed, 48 insertions(+), 45 deletions(-) diff --git a/source/adapters/cuda/common.cpp b/source/adapters/cuda/common.cpp index f2b6dae841..89500d1a1c 100644 --- a/source/adapters/cuda/common.cpp +++ b/source/adapters/cuda/common.cpp @@ -104,13 +104,15 @@ void detail::ur::assertion(bool Condition, const char *Message) { // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; +thread_local char ErrorMessage[MaxMessageSize]{}; // Utility function for setting a message and warning [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { - assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); + assert(strlen(pMessage) < MaxMessageSize); + // Copy at most MaxMessageSize - 1 bytes to ensure the resultant string is + // always null terminated. + strncpy(ErrorMessage, pMessage, MaxMessageSize - 1); ErrorMessageCode = ErrorCode; } diff --git a/source/adapters/hip/common.cpp b/source/adapters/hip/common.cpp index da8cc2c765..23ef1d3301 100644 --- a/source/adapters/hip/common.cpp +++ b/source/adapters/hip/common.cpp @@ -164,12 +164,14 @@ void detail::ur::assertion(bool Condition, const char *pMessage) { // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; +thread_local char ErrorMessage[MaxMessageSize]{}; // Utility function for setting a message and warning [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { assert(strlen(pMessage) < MaxMessageSize); + // Copy at most MaxMessageSize - 1 bytes to ensure the resultant string is + // always null terminated. strncpy(ErrorMessage, pMessage, MaxMessageSize - 1); ErrorMessageCode = ErrorCode; } diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index f5d8b20014..371ed78aeb 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -332,15 +332,17 @@ template <> zes_structure_type_t getZesStructureType() { // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; +thread_local char ErrorMessage[MaxMessageSize]{}; thread_local int32_t ErrorAdapterNativeCode; // Utility function for setting a message and warning [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode, int32_t AdapterErrorCode) { - assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); + assert(strlen(pMessage) < MaxMessageSize); + // Copy at most MaxMessageSize - 1 bytes to ensure the resultant string is + // always null terminated. 
+ strncpy(ErrorMessage, pMessage, MaxMessageSize - 1); ErrorMessageCode = ErrorCode; ErrorAdapterNativeCode = AdapterErrorCode; } diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp index b956fc8c7a..ab7c7a07ea 100644 --- a/source/adapters/native_cpu/common.cpp +++ b/source/adapters/native_cpu/common.cpp @@ -13,13 +13,15 @@ // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR // See urGetLastResult thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; +thread_local char ErrorMessage[MaxMessageSize]{}; // Utility function for setting a message and warning [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { - assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); + assert(strlen(pMessage) < MaxMessageSize); + // Copy at most MaxMessageSize - 1 bytes to ensure the resultant string is + // always null terminated. + strncpy(ErrorMessage, pMessage, MaxMessageSize - 1); ErrorMessageCode = ErrorCode; } diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index d6e934c68b..33da43a182 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -14,11 +14,14 @@ namespace cl_adapter { /* Global variables for urAdapterGetLastError() */ thread_local int32_t ErrorMessageCode = 0; -thread_local char ErrorMessage[MaxMessageSize]; +thread_local char ErrorMessage[MaxMessageSize]{}; [[maybe_unused]] void setErrorMessage(const char *Message, int32_t ErrorCode) { - assert(strlen(Message) <= cl_adapter::MaxMessageSize); - strcpy(cl_adapter::ErrorMessage, Message); + assert(strlen(Message) < cl_adapter::MaxMessageSize); + // Copy at most MaxMessageSize - 1 bytes to ensure the resultant string is + // always null terminated. 
+ strncpy(cl_adapter::ErrorMessage, Message, MaxMessageSize - 1); + ErrorMessageCode = ErrorCode; } } // namespace cl_adapter diff --git a/source/loader/layers/sanitizer/asan/asan_report.cpp b/source/loader/layers/sanitizer/asan/asan_report.cpp index fe7c1e0f87..6300e63e2b 100644 --- a/source/loader/layers/sanitizer/asan/asan_report.cpp +++ b/source/loader/layers/sanitizer/asan/asan_report.cpp @@ -47,11 +47,10 @@ void ReportBadFree(uptr Addr, const StackTrace &stack, if (!AI) { getContext()->logger.always("{} may be allocated on Host Memory", (void *)Addr); + } else { + assert(!AI->IsReleased && "Chunk must be not released"); + PrintAllocateInfo(Addr, AI.get()); } - - assert(AI && !AI->IsReleased && "Chunk must be not released"); - - PrintAllocateInfo(Addr, AI.get()); } void ReportBadContext(uptr Addr, const StackTrace &stack, diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_stacktrace.cpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_stacktrace.cpp index 357eff4b77..65b0e318dc 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_stacktrace.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_stacktrace.cpp @@ -31,7 +31,7 @@ bool Contains(const std::string &s, const char *p) { // Parse back trace information in the following formats: // ([function_name]+function_offset) [offset] -void ParseBacktraceInfo(BacktraceInfo BI, std::string &ModuleName, +void ParseBacktraceInfo(const BacktraceInfo &BI, std::string &ModuleName, uptr &Offset) { // Parse module name size_t End = BI.find_first_of('('); diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index 006ea09b8b..cc64ab11ea 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -222,7 +222,7 @@ void uur::PlatformEnvironment::selectPlatformFromOptions() { std::stringstream errstr; errstr << "Multiple possible platforms found; please select one of the " "following or set --platforms_count=1:\n"; - for (auto p : platforms_filtered) { + for (const auto &p : platforms_filtered) { errstr << " --backend=" << backend_to_str(p.backend) << " --platform=\"" << p.name << "\"\n"; } @@ -287,7 +287,7 @@ PlatformEnvironment::parsePlatformOptions(int argc, char **argv) { } else if (std::strncmp(arg, "--backend=", sizeof("--backend=") - 1) == 0) { std::string backend_string{&arg[std::strlen("--backend=")]}; - if (!parse_backend(backend_string)) { + if (!parse_backend(std::move(backend_string))) { return options; } } else if (std::strncmp(arg, "--platforms_count=", diff --git a/test/conformance/usm/urUSMFree.cpp b/test/conformance/usm/urUSMFree.cpp index f5502c89a6..a670c26f4b 100644 --- a/test/conformance/usm/urUSMFree.cpp +++ b/test/conformance/usm/urUSMFree.cpp @@ -19,16 +19,16 @@ TEST_P(urUSMFreeTest, SuccessDeviceAlloc) { size_t allocation_size = sizeof(int); ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, allocation_size, &ptr)); + ASSERT_NE(ptr, nullptr); ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); EXPECT_SUCCESS(urQueueFlush(queue)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); - ASSERT_NE(ptr, nullptr); ASSERT_SUCCESS(urUSMFree(context, ptr)); ASSERT_SUCCESS(urEventRelease(event)); } @@ -43,15 +43,15 @@ TEST_P(urUSMFreeTest, SuccessHostAlloc) { 
size_t allocation_size = sizeof(int); ASSERT_SUCCESS( urUSMHostAlloc(context, nullptr, nullptr, allocation_size, &ptr)); + ASSERT_NE(ptr, nullptr); ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); EXPECT_SUCCESS(urQueueFlush(queue)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); - ASSERT_NE(ptr, nullptr); ASSERT_SUCCESS(urUSMFree(context, ptr)); ASSERT_SUCCESS(urEventRelease(event)); } @@ -73,15 +73,15 @@ TEST_P(urUSMFreeTest, SuccessSharedAlloc) { size_t allocation_size = sizeof(int); ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, allocation_size, &ptr)); + ASSERT_NE(ptr, nullptr); ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); EXPECT_SUCCESS(urQueueFlush(queue)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); - ASSERT_NE(ptr, nullptr); ASSERT_SUCCESS(urUSMFree(context, ptr)); ASSERT_SUCCESS(urEventRelease(event)); } diff --git a/test/conformance/usm/urUSMSharedAlloc.cpp b/test/conformance/usm/urUSMSharedAlloc.cpp index e543602fbc..021fa6368e 100644 --- a/test/conformance/usm/urUSMSharedAlloc.cpp +++ b/test/conformance/usm/urUSMSharedAlloc.cpp @@ -44,6 +44,7 @@ struct urUSMSharedAllocTest ur_usm_pool_handle_t pool = nullptr; bool usePool = std::get<0>(getParam()).value; + void *ptr = nullptr; }; // The 0 value parameters are not relevant for urUSMSharedAllocTest tests, they @@ -57,7 +58,6 @@ UUR_TEST_SUITE_P( uur::printUSMAllocTestString); TEST_P(urUSMSharedAllocTest, Success) { - void *ptr = nullptr; size_t allocation_size = sizeof(int); ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, pool, allocation_size, &ptr)); @@ -65,9 +65,9 @@ TEST_P(urUSMSharedAllocTest, Success) { ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); ASSERT_SUCCESS(urUSMFree(context, ptr)); EXPECT_SUCCESS(urEventRelease(event)); @@ -85,7 +85,6 @@ TEST_P(urUSMSharedAllocTest, SuccessWithDescriptors) { ur_usm_desc_t usm_desc{UR_STRUCTURE_TYPE_USM_DESC, &usm_host_desc, /* mem advice flags */ UR_USM_ADVICE_FLAG_DEFAULT, /* alignment */ 0}; - void *ptr = nullptr; size_t allocation_size = sizeof(int); ASSERT_SUCCESS(urUSMSharedAlloc(context, device, &usm_desc, pool, allocation_size, &ptr)); @@ -93,9 +92,9 @@ TEST_P(urUSMSharedAllocTest, SuccessWithDescriptors) { ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); ASSERT_SUCCESS(urUSMFree(context, ptr)); EXPECT_SUCCESS(urEventRelease(event)); @@ -107,7 +106,6 @@ TEST_P(urUSMSharedAllocTest, SuccessWithMultipleAdvices) { /* mem advice flags */ UR_USM_ADVICE_FLAG_SET_READ_MOSTLY | UR_USM_ADVICE_FLAG_BIAS_CACHED, /* alignment */ 0}; - void *ptr = nullptr; 
size_t allocation_size = sizeof(int); ASSERT_SUCCESS(urUSMSharedAlloc(context, device, &usm_desc, pool, allocation_size, &ptr)); @@ -115,23 +113,21 @@ TEST_P(urUSMSharedAllocTest, SuccessWithMultipleAdvices) { ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); ASSERT_SUCCESS(urUSMFree(context, ptr)); EXPECT_SUCCESS(urEventRelease(event)); } TEST_P(urUSMSharedAllocTest, InvalidNullHandleContext) { - void *ptr = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_HANDLE, urUSMSharedAlloc(nullptr, device, nullptr, pool, sizeof(int), &ptr)); } TEST_P(urUSMSharedAllocTest, InvalidNullHandleDevice) { - void *ptr = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_HANDLE, urUSMSharedAlloc(context, nullptr, nullptr, pool, sizeof(int), &ptr)); @@ -144,14 +140,12 @@ TEST_P(urUSMSharedAllocTest, InvalidNullPtrMem) { } TEST_P(urUSMSharedAllocTest, InvalidUSMSize) { - void *ptr = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_USM_SIZE, urUSMSharedAlloc(context, device, nullptr, pool, -1, &ptr)); } TEST_P(urUSMSharedAllocTest, InvalidValueAlignPowerOfTwo) { - void *ptr = nullptr; ur_usm_desc_t desc = {}; desc.stype = UR_STRUCTURE_TYPE_USM_DESC; desc.align = 5; @@ -185,16 +179,15 @@ TEST_P(urUSMSharedAllocAlignmentTest, SuccessAlignedAllocations) { /* mem advice flags */ UR_USM_ADVICE_FLAG_DEFAULT, alignment}; - void *ptr = nullptr; ASSERT_SUCCESS(urUSMSharedAlloc(context, device, &usm_desc, pool, allocation_size, &ptr)); ASSERT_NE(ptr, nullptr); ur_event_handle_t event = nullptr; uint8_t pattern = 0; - ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, + EXPECT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(pattern), &pattern, allocation_size, 0, nullptr, &event)); - ASSERT_SUCCESS(urEventWait(1, &event)); + EXPECT_SUCCESS(urEventWait(1, &event)); ASSERT_SUCCESS(urUSMFree(context, ptr)); EXPECT_SUCCESS(urEventRelease(event)); From 6572a044d41286949eb4be35bacfbbfdda825daa Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Wed, 27 Nov 2024 03:39:29 +0100 Subject: [PATCH 023/148] allow uninserted program --- source/loader/layers/sanitizer/asan/asan_ddi.cpp | 11 +++++++---- .../loader/layers/sanitizer/asan/asan_interceptor.cpp | 3 +++ .../loader/layers/sanitizer/asan/asan_interceptor.hpp | 6 ++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 489c4cc4e4..9695a8461e 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -55,6 +55,9 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, bool isInstrumentedKernel(ur_kernel_handle_t hKernel) { auto hProgram = GetProgram(hKernel); auto PI = getAsanInterceptor()->getProgramInfo(hProgram); + if (PI == nullptr) { + return false; + } return PI->isKernelInstrumented(hKernel); } @@ -290,8 +293,9 @@ __urdlllocal ur_result_t UR_APICALL urProgramRetain( UR_CALL(pfnRetain(hProgram)); auto ProgramInfo = getAsanInterceptor()->getProgramInfo(hProgram); - UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); - ProgramInfo->RefCount++; + if (ProgramInfo != nullptr) { + ProgramInfo->RefCount++; + } return UR_RESULT_SUCCESS; } @@ -419,8 +423,7 @@ ur_result_t 
UR_APICALL urProgramRelease( UR_CALL(pfnProgramRelease(hProgram)); auto ProgramInfo = getAsanInterceptor()->getProgramInfo(hProgram); - UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); - if (--ProgramInfo->RefCount == 0) { + if (ProgramInfo != nullptr && --ProgramInfo->RefCount == 0) { UR_CALL(getAsanInterceptor()->unregisterProgram(hProgram)); UR_CALL(getAsanInterceptor()->eraseProgram(hProgram)); } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index af5bd59944..95bde5a5b5 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -436,6 +436,7 @@ ur_result_t AsanInterceptor::registerProgram(ur_program_handle_t Program) { ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { auto ProgramInfo = getProgramInfo(Program); + assert(ProgramInfo != nullptr && "unregistered program!"); for (auto AI : ProgramInfo->AllocInfoForGlobals) { UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI)); @@ -483,6 +484,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { } auto PI = getProgramInfo(Program); + assert(PI != nullptr && "unregistered program!"); for (const auto &SKI : SKInfo) { if (SKI.Size == 0) { continue; @@ -519,6 +521,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { auto Context = GetContext(Program); auto ContextInfo = getContextInfo(Context); auto ProgramInfo = getProgramInfo(Program); + assert(ProgramInfo != nullptr && "unregistered program!"); for (auto Device : Devices) { ManagedQueue Queue(Context, Device); diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index d8dd11101c..4254fcff15 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -266,8 +266,10 @@ class AsanInterceptor { std::shared_ptr getProgramInfo(ur_program_handle_t Program) { std::shared_lock Guard(m_ProgramMapMutex); - assert(m_ProgramMap.find(Program) != m_ProgramMap.end()); - return m_ProgramMap[Program]; + if (m_ProgramMap.find(Program) != m_ProgramMap.end()) { + return m_ProgramMap[Program]; + } + return nullptr; } std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { From 8411e2bcc841ee82822362af8b81db0c3d99210a Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Wed, 27 Nov 2024 03:50:55 +0100 Subject: [PATCH 024/148] update api usage --- source/loader/layers/sanitizer/asan/asan_ddi.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 9695a8461e..741b4d421c 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -369,7 +369,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( UR_CALL(pfnProgramLink(hContext, count, phPrograms, pOptions, phProgram)); UR_CALL(getAsanInterceptor()->insertProgram(*phProgram)); - UR_CALL(getAsanInterceptor()->registerProgram(hContext, *phProgram)); + UR_CALL(getAsanInterceptor()->registerProgram(*phProgram)); return UR_RESULT_SUCCESS; } @@ -401,7 +401,7 @@ ur_result_t UR_APICALL urProgramLinkExp( phPrograms, pOptions, phProgram)); UR_CALL(getAsanInterceptor()->insertProgram(*phProgram)); - UR_CALL(getAsanInterceptor()->registerProgram(hContext, *phProgram)); + 
UR_CALL(getAsanInterceptor()->registerProgram(*phProgram)); return UR_RESULT_SUCCESS; } From c67d1283ffa78ac2d348c1c69efd24812e9a4983 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Thu, 21 Nov 2024 14:04:15 +0000 Subject: [PATCH 025/148] Add query to retrieve adapter handle from platform. --- include/ur_api.h | 4 ++- include/ur_print.hpp | 16 +++++++++ scripts/core/platform.yml | 4 ++- source/adapters/cuda/platform.cpp | 4 +++ source/adapters/hip/platform.cpp | 4 +++ source/adapters/level_zero/platform.cpp | 2 ++ source/adapters/native_cpu/CMakeLists.txt | 1 + source/adapters/native_cpu/adapter.cpp | 1 + source/adapters/native_cpu/adapter.hpp | 13 ++++++++ source/adapters/native_cpu/platform.cpp | 5 +-- source/adapters/opencl/platform.cpp | 3 ++ source/loader/layers/validation/ur_valddi.cpp | 2 +- source/loader/ur_ldrddi.cpp | 33 +++++++++++++++++++ source/loader/ur_libapi.cpp | 2 +- source/ur_api.cpp | 2 +- .../platform/urPlatformGetInfo.cpp | 26 +++++++++++++-- tools/urinfo/urinfo.hpp | 2 ++ 17 files changed, 115 insertions(+), 9 deletions(-) create mode 100644 source/adapters/native_cpu/adapter.hpp diff --git a/include/ur_api.h b/include/ur_api.h index eb8b07221c..3dd476328f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1087,6 +1087,8 @@ typedef enum ur_platform_info_t { ///< info needs to be dynamically queried. UR_PLATFORM_INFO_BACKEND = 6, ///< [::ur_platform_backend_t] The backend of the platform. Identifies the ///< native backend adapter implementing this platform. + UR_PLATFORM_INFO_ADAPTER = 7, ///< [::ur_adapter_handle_t] The adapter handle associated with the + ///< platform. /// @cond UR_PLATFORM_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1112,7 +1114,7 @@ typedef enum ur_platform_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hPlatform` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_PLATFORM_INFO_BACKEND < propName` +/// + `::UR_PLATFORM_INFO_ADAPTER < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 8888a74f91..40e0c1793d 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -2024,6 +2024,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_platform_info_t value) case UR_PLATFORM_INFO_BACKEND: os << "UR_PLATFORM_INFO_BACKEND"; break; + case UR_PLATFORM_INFO_ADAPTER: + os << "UR_PLATFORM_INFO_ADAPTER"; + break; default: os << "unknown enumerator"; break; @@ -2077,6 +2080,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_platform_in os << ")"; } break; + case UR_PLATFORM_INFO_ADAPTER: { + const ur_adapter_handle_t *tptr = (const ur_adapter_handle_t *)ptr; + if (sizeof(ur_adapter_handle_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_adapter_handle_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + ur::details::printPtr(os, + *tptr); + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml index 997f4918ee..d4d7ef6a80 100644 --- a/scripts/core/platform.yml +++ b/scripts/core/platform.yml @@ -77,7 +77,9 @@ etors: - name: BACKEND value: "6" desc: "[$x_platform_backend_t] The backend of the platform. Identifies the native backend adapter implementing this platform." 
- + - name: ADAPTER + value: "7" + desc: "[$x_adapter_handle_t] The adapter handle associated with the platform." --- #-------------------------------------------------------------------------- type: function desc: "Retrieves various information about platform" diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index 7ce0bba9e7..20518494f7 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "platform.hpp" +#include "adapter.hpp" #include "common.hpp" #include "context.hpp" #include "device.hpp" @@ -41,6 +42,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( case UR_PLATFORM_INFO_BACKEND: { return ReturnValue(UR_PLATFORM_BACKEND_CUDA); } + case UR_PLATFORM_INFO_ADAPTER: { + return ReturnValue(&adapter); + } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/hip/platform.cpp b/source/adapters/hip/platform.cpp index 007889f138..fa0b07cc82 100644 --- a/source/adapters/hip/platform.cpp +++ b/source/adapters/hip/platform.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "platform.hpp" +#include "adapter.hpp" #include "context.hpp" UR_APIEXPORT ur_result_t UR_APICALL @@ -34,6 +35,9 @@ urPlatformGetInfo(ur_platform_handle_t, ur_platform_info_t propName, case UR_PLATFORM_INFO_EXTENSIONS: { return ReturnValue(""); } + case UR_PLATFORM_INFO_ADAPTER: { + return ReturnValue(&adapter); + } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 0237b62863..18a417ff1b 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -95,6 +95,8 @@ ur_result_t urPlatformGetInfo( return ReturnValue(Platform->ZeDriverApiVersion.c_str()); case UR_PLATFORM_INFO_BACKEND: return ReturnValue(UR_PLATFORM_BACKEND_LEVEL_ZERO); + case UR_PLATFORM_INFO_ADAPTER: + return ReturnValue(GlobalAdapter); default: logger::debug("urPlatformGetInfo: unrecognized ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 56cfc577d8..17467bfdef 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -9,6 +9,7 @@ set(TARGET_NAME ur_adapter_native_cpu) add_ur_adapter(${TARGET_NAME} SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.hpp ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp diff --git a/source/adapters/native_cpu/adapter.cpp b/source/adapters/native_cpu/adapter.cpp index 2b5b95ccd0..01fffeb01e 100644 --- a/source/adapters/native_cpu/adapter.cpp +++ b/source/adapters/native_cpu/adapter.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include "adapter.hpp" #include "common.hpp" #include "ur_api.h" diff --git a/source/adapters/native_cpu/adapter.hpp b/source/adapters/native_cpu/adapter.hpp new file mode 100644 index 0000000000..2607aeb542 --- /dev/null +++ b/source/adapters/native_cpu/adapter.hpp @@ -0,0 +1,13 @@ +//===---------------- adapter.hpp - Native CPU Adapter --------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +struct ur_adapter_handle_t_; + +extern ur_adapter_handle_t_ Adapter; diff --git a/source/adapters/native_cpu/platform.cpp b/source/adapters/native_cpu/platform.cpp index 840f18f8b3..8e55037079 100644 --- a/source/adapters/native_cpu/platform.cpp +++ b/source/adapters/native_cpu/platform.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "platform.hpp" +#include "adapter.hpp" #include "common.hpp" #include "ur/ur.hpp" @@ -75,9 +76,9 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, return ReturnValue(""); case UR_PLATFORM_INFO_BACKEND: - // TODO(alcpz): PR with this enum value at - // https://github.com/oneapi-src/unified-runtime return ReturnValue(UR_PLATFORM_BACKEND_NATIVE_CPU); + case UR_PLATFORM_INFO_ADAPTER: + return ReturnValue(&Adapter); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/opencl/platform.cpp b/source/adapters/opencl/platform.cpp index b6d3a77cee..1400f27cf4 100644 --- a/source/adapters/opencl/platform.cpp +++ b/source/adapters/opencl/platform.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "platform.hpp" +#include "adapter.hpp" ur_result_t cl_adapter::getPlatformVersion(cl_platform_id Plat, oclv::OpenCLVersion &Version) { @@ -57,6 +58,8 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, switch (static_cast(propName)) { case UR_PLATFORM_INFO_BACKEND: return ReturnValue(UR_PLATFORM_BACKEND_OPENCL); + case UR_PLATFORM_INFO_ADAPTER: + return ReturnValue(ur::cl::getAdapter()); case UR_PLATFORM_INFO_NAME: case UR_PLATFORM_INFO_VENDOR_NAME: case UR_PLATFORM_INFO_VERSION: diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b3969de10f..6e96efe6bf 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -280,7 +280,7 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_PLATFORM_INFO_BACKEND < propName) { + if (UR_PLATFORM_INFO_ADAPTER < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 86a6ad95a0..f482eb3560 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -289,10 +289,43 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetInfo( // convert loader handle to platform handle hPlatform = reinterpret_cast(hPlatform)->handle; + // this value is needed for converting adapter handles to loader handles + size_t sizeret = 0; + if (pPropSizeRet == NULL) { + pPropSizeRet = &sizeret; + } + // forward to device-platform result = pfnGetInfo(hPlatform, propName, propSize, pPropValue, pPropSizeRet); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + if (pPropValue != nullptr) { + switch (propName) { + case UR_PLATFORM_INFO_ADAPTER: { + ur_adapter_handle_t *handles = + reinterpret_cast(pPropValue); + size_t nelements = *pPropSizeRet / sizeof(ur_adapter_handle_t); + for (size_t i = 0; i < nelements; ++i) { + if (handles[i] != nullptr) { + handles[i] = reinterpret_cast( + context->factories.ur_adapter_factory.getInstance( + handles[i], dditable)); + } + } + } break; + default: { + } break; + } + } + } catch (std::bad_alloc &) { + result 
= UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 3340363737..3c6822d613 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -557,7 +557,7 @@ ur_result_t UR_APICALL urPlatformGet( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hPlatform` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_PLATFORM_INFO_BACKEND < propName` +/// + `::UR_PLATFORM_INFO_ADAPTER < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 853d61472e..7f7eb65d40 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -501,7 +501,7 @@ ur_result_t UR_APICALL urPlatformGet( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hPlatform` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_PLATFORM_INFO_BACKEND < propName` +/// + `::UR_PLATFORM_INFO_ADAPTER < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/test/conformance/platform/urPlatformGetInfo.cpp b/test/conformance/platform/urPlatformGetInfo.cpp index 1dc92b26d7..3973b8ee6b 100644 --- a/test/conformance/platform/urPlatformGetInfo.cpp +++ b/test/conformance/platform/urPlatformGetInfo.cpp @@ -19,7 +19,8 @@ INSTANTIATE_TEST_SUITE_P( urPlatformGetInfo, urPlatformGetInfoTest, ::testing::Values(UR_PLATFORM_INFO_NAME, UR_PLATFORM_INFO_VENDOR_NAME, UR_PLATFORM_INFO_VERSION, UR_PLATFORM_INFO_EXTENSIONS, - UR_PLATFORM_INFO_PROFILE, UR_PLATFORM_INFO_BACKEND), + UR_PLATFORM_INFO_PROFILE, UR_PLATFORM_INFO_BACKEND, + UR_PLATFORM_INFO_ADAPTER), [](const ::testing::TestParamInfo &info) { std::stringstream ss; ss << info.param; @@ -38,8 +39,29 @@ TEST_P(urPlatformGetInfoTest, Success) { std::vector name(size); ASSERT_SUCCESS( urPlatformGetInfo(platform, info_type, size, name.data(), nullptr)); - if (info_type != UR_PLATFORM_INFO_BACKEND) { + switch (info_type) { + case UR_PLATFORM_INFO_NAME: + case UR_PLATFORM_INFO_VENDOR_NAME: + case UR_PLATFORM_INFO_VERSION: + case UR_PLATFORM_INFO_EXTENSIONS: + case UR_PLATFORM_INFO_PROFILE: { ASSERT_EQ(size, std::strlen(name.data()) + 1); + break; + } + case UR_PLATFORM_INFO_BACKEND: { + ASSERT_EQ(size, sizeof(ur_platform_backend_t)); + break; + } + case UR_PLATFORM_INFO_ADAPTER: { + auto queried_adapter = + *reinterpret_cast(name.data()); + auto adapter_found = + std::find(adapters.begin(), adapters.end(), queried_adapter); + ASSERT_NE(adapter_found, adapters.end()); + break; + } + default: + break; } } diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 37c7a80328..b1dcb9e57e 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -45,6 +45,8 @@ inline void printPlatformInfos(ur_platform_handle_t hPlatform, std::cout << prefix; printPlatformInfo(hPlatform, UR_PLATFORM_INFO_BACKEND); + std::cout << prefix; + printPlatformInfo(hPlatform, UR_PLATFORM_INFO_ADAPTER); } inline void printDeviceInfos(ur_device_handle_t hDevice, From 252b3822f0e81ca2fff3b786ee9b8294bd88cad6 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Wed, 27 Nov 2024 15:33:49 +0100 Subject: [PATCH 026/148] fix event caching L0 requires the adapter to use a variety of different events, depending on the specific use case and configuration. Events are also unique for devices. 
And, because the adapter wants to avoid unnecessarily allocating new
events from the driver, this necessitates an event caching solution
that can separate the different event type and device combinations.

When counter-based events were introduced, the event caching was not
properly expanded to take that event type into consideration,
presumably with the assumption that normal and counter-based events
will never coexist. Unfortunately that is not true for the current
adapter implementation.

This patch simplifies the event caching logic, ensuring that each
unique event type and device combination has its own event cache.
---
 source/adapters/level_zero/context.cpp | 27 +++++++---
 source/adapters/level_zero/context.hpp | 72 +++++++++++++-------------
 2 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index 7c1c412ee4..4fd1db0933 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -565,18 +565,26 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(
     bool HostVisible, bool WithProfiling, ur_device_handle_t Device,
     bool CounterBasedEventEnabled) {
   std::scoped_lock Lock(EventCacheMutex);
-  auto Cache = getEventCache(HostVisible, WithProfiling, Device);
-  if (Cache->empty())
+  auto Cache = getEventCache(HostVisible, WithProfiling, Device,
+                             CounterBasedEventEnabled);
+  if (Cache->empty()) {
+    logger::info("Cache empty (Host Visible: {}, Profiling: {}, Counter: {}, "
+                 "Device: {})",
+                 HostVisible, WithProfiling, CounterBasedEventEnabled, Device);
     return nullptr;
+  }
 
   auto It = Cache->begin();
   ur_event_handle_t Event = *It;
-  if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) {
-    return nullptr;
-  }
   Cache->erase(It);
   // We have to reset event before using it.
   Event->reset();
+
+  logger::info("Using {} event (Host Visible: {}, Profiling: {}, Counter: {}, "
+               "Device: {}) from cache {}",
+               Event, Event->HostVisibleEvent, Event->isProfilingEnabled(),
+               Event->CounterBasedEventsEnabled, Device, Cache);
+
   return Event;
 }
 
@@ -588,8 +596,13 @@ void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) {
     Device = Event->UrQueue->Device;
   }
 
-  auto Cache = getEventCache(Event->isHostVisible(),
-                             Event->isProfilingEnabled(), Device);
+  auto Cache =
+      getEventCache(Event->isHostVisible(), Event->isProfilingEnabled(), Device,
+                    Event->CounterBasedEventsEnabled);
+  logger::info("Inserting {} event (Host Visible: {}, Profiling: {}, Counter: "
+               "{}, Device: {}) into cache {}",
+               Event, Event->HostVisibleEvent, Event->isProfilingEnabled(),
+               Event->CounterBasedEventsEnabled, Device, Cache);
   Cache->emplace_back(Event);
 }
 
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index 0d3b2846e2..470c4c4f35 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -169,15 +169,6 @@ struct ur_context_handle_t_ : _ur_object {
   // holding the current pool usage counts.
   ur_mutex ZeEventPoolCacheMutex;
 
-  // Mutex to control operations on event caches.
-  ur_mutex EventCacheMutex;
-
-  // Caches for events.
-  using EventCache = std::vector<std::list<ur_event_handle_t>>;
-  EventCache EventCaches{4};
-  std::vector<std::unordered_map<ur_device_handle_t, size_t>>
-      EventCachesDeviceMap{4};
-
   // Initialize the PI context.
  ur_result_t initialize();
 
@@ -313,36 +304,45 @@ struct ur_context_handle_t_ : _ur_object {
   ze_context_handle_t getZeHandle() const;
 
 private:
+  enum EventFlags {
+    EVENT_FLAG_HOST_VISIBLE = UR_BIT(0),
+    EVENT_FLAG_WITH_PROFILING = UR_BIT(1),
+    EVENT_FLAG_COUNTER = UR_BIT(2),
+    EVENT_FLAG_DEVICE = UR_BIT(3), // if set, subsequent bits are device id
+    MAX_EVENT_FLAG_BITS =
+        4, // this is used as an offset for embedding device id
+  };
+
+  // Mutex to control operations on event caches.
+  ur_mutex EventCacheMutex;
+
+  // Caches for events.
+  using EventCache = std::list<ur_event_handle_t>;
+  std::vector<EventCache> EventCaches;
+
   // Get the cache of events for a provided scope and profiling mode.
-  auto getEventCache(bool HostVisible, bool WithProfiling,
-                     ur_device_handle_t Device) {
+  EventCache *getEventCache(bool HostVisible, bool WithProfiling,
+                            ur_device_handle_t Device, bool Counter) {
+
+    size_t index = 0;
     if (HostVisible) {
-      if (Device) {
-        auto EventCachesMap =
-            WithProfiling ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1];
-        if (EventCachesMap->find(Device) == EventCachesMap->end()) {
-          EventCaches.emplace_back();
-          EventCachesMap->insert(
-              std::make_pair(Device, EventCaches.size() - 1));
-        }
-        return &EventCaches[(*EventCachesMap)[Device]];
-      } else {
-        return WithProfiling ? &EventCaches[0] : &EventCaches[1];
-      }
-    } else {
-      if (Device) {
-        auto EventCachesMap =
-            WithProfiling ? &EventCachesDeviceMap[2] : &EventCachesDeviceMap[3];
-        if (EventCachesMap->find(Device) == EventCachesMap->end()) {
-          EventCaches.emplace_back();
-          EventCachesMap->insert(
-              std::make_pair(Device, EventCaches.size() - 1));
-        }
-        return &EventCaches[(*EventCachesMap)[Device]];
-      } else {
-        return WithProfiling ? &EventCaches[2] : &EventCaches[3];
-      }
+      index |= EVENT_FLAG_HOST_VISIBLE;
+    }
+    if (WithProfiling) {
+      index |= EVENT_FLAG_WITH_PROFILING;
     }
+    if (Counter) {
+      index |= EVENT_FLAG_COUNTER;
+    }
+    if (Device) {
+      index |= EVENT_FLAG_DEVICE | (*Device->Id << MAX_EVENT_FLAG_BITS);
+    }
+
+    if (index >= EventCaches.size()) {
+      EventCaches.resize(index + 1);
+    }
+
+    return &EventCaches[index];
   }
 };

From 105d4e079b8a41147f5a908c1c3236ba2bfbb875 Mon Sep 17 00:00:00 2001
From: Piotr Balcer
Date: Thu, 21 Nov 2024 14:48:13 +0100
Subject: [PATCH 027/148] add 3 more benchmarks

This patch implements support for:
- dl-cifar
- dl-mnist
- svm

These are all workloads from Velocity Bench that require oneMKL.
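
As a rough sketch of the pattern used below, a new oneMKL-dependent
workload subclasses VelocityBase, borrows the shared oneAPI install for
headers and libraries, and scrapes its timing from stdout. The class,
binary name, and output regex here are invented for illustration and do
not correspond to a real Velocity Bench workload:

    import re

    class FooBench(VelocityBase):
        def __init__(self, vb: VelocityBench):
            self.oneapi = get_oneapi()  # process-wide singleton from oneapi.py
            super().__init__("foo", "foo_sycl", vb, "s")

        def ld_libraries(self):
            # oneMKL/oneDNN shared libraries must be on LD_LIBRARY_PATH
            # both when building and when running the benchmark binary.
            return self.oneapi.ld_libraries()

        def extra_cmake_args(self):
            return [f"-DCMAKE_CXX_FLAGS=-O3 -fsycl "
                    f"-I{self.oneapi.mkl_include()} -L{self.oneapi.mkl_lib()}"]

        def name(self):
            return "Velocity-Bench foo"

        def parse_output(self, stdout: str) -> float:
            match = re.search(r'foo - total time: (\d+\.\d+) s', stdout)
            if not match:
                raise ValueError("Failed to parse benchmark output.")
            return float(match.group(1))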
--- scripts/benchmarks/benches/base.py | 18 ++-- scripts/benchmarks/benches/llamacpp.py | 75 +-------------- scripts/benchmarks/benches/oneapi.py | 86 +++++++++++++++++ scripts/benchmarks/benches/options.py | 1 + scripts/benchmarks/benches/velocity.py | 122 ++++++++++++++++++++++++- scripts/benchmarks/main.py | 1 + scripts/benchmarks/utils/utils.py | 9 +- 7 files changed, 224 insertions(+), 88 deletions(-) create mode 100644 scripts/benchmarks/benches/oneapi.py diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index feeaa568b6..4356fb0d96 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -40,19 +40,21 @@ def run_bench(self, command, env_vars, ld_library=[]): ld_library=ld_library ).stdout.decode() - def create_data_path(self, name): - data_path = os.path.join(self.directory, "data", name) - - if options.rebuild and Path(data_path).exists(): - shutil.rmtree(data_path) + def create_data_path(self, name, skip_data_dir = False): + if skip_data_dir: + data_path = os.path.join(self.directory, name) + else: + data_path = os.path.join(self.directory, 'data', name) + if options.rebuild and Path(data_path).exists(): + shutil.rmtree(data_path) Path(data_path).mkdir(parents=True, exist_ok=True) return data_path - def download(self, name, url, file, untar = False): - self.data_path = self.create_data_path(name) - return download(self.data_path, url, file, True) + def download(self, name, url, file, untar = False, unzip = False, skip_data_dir = False): + self.data_path = self.create_data_path(name, skip_data_dir) + return download(self.data_path, url, file, untar, unzip) def name(self): raise NotImplementedError() diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py index 50dd8d04c6..4a260a09cc 100644 --- a/scripts/benchmarks/benches/llamacpp.py +++ b/scripts/benchmarks/benches/llamacpp.py @@ -6,85 +6,14 @@ import csv import io from pathlib import Path -import re -import shutil from utils.utils import download, git_clone from .base import Benchmark, Suite from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi import os -class OneAPI: - # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 - def __init__(self, directory): - self.oneapi_dir = os.path.join(directory, 'oneapi') - Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) - # delete if some option is set? - - # can we just hardcode these links? 
- self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') - self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') - return - - def install_package(self, name, url): - package_path = os.path.join(self.oneapi_dir, name) - if Path(package_path).exists(): - print(f"{package_path} exists, skipping installing oneAPI package {name}...") - return - - package = download(self.oneapi_dir, url, f'package_{name}.sh') - try: - print(f"installing f{name}") - run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") - except: - print("oneAPI installation likely exists already") - return - print(f"f{name} installation complete") - - def package_dir(self, package, dir): - return os.path.join(self.oneapi_dir, package, 'latest', dir) - - def package_cmake(self, package): - package_lib = self.package_dir(package, 'lib') - return os.path.join(package_lib, 'cmake', package) - - def mkl_lib(self): - return self.package_dir('mkl', 'lib') - - def mkl_include(self): - return self.package_dir('mkl', 'include') - - def mkl_cmake(self): - return self.package_cmake('mkl') - - def dnn_lib(self): - return self.package_dir('dnnl', 'lib') - - def dnn_include(self): - return self.package_dir('dnnl', 'include') - - def dnn_cmake(self): - return self.package_cmake('dnnl') - - def tbb_lib(self): - return self.package_dir('tbb', 'lib') - - def tbb_cmake(self): - return self.package_cmake('tbb') - - def compiler_lib(self): - return self.package_dir('compiler', 'lib') - - def ld_libraries(self): - return [ - self.compiler_lib(), - self.mkl_lib(), - self.tbb_lib(), - self.dnn_lib() - ] - class LlamaCppBench(Suite): def __init__(self, directory): if options.sycl is None: @@ -103,7 +32,7 @@ def setup(self): self.model = download(self.models_dir, "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf", "Phi-3-mini-4k-instruct-q4.gguf") - self.oneapi = OneAPI(self.directory) + self.oneapi = get_oneapi() self.build_path = create_build_path(self.directory, 'llamacpp-build') diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py new file mode 100644 index 0000000000..414c4aa64a --- /dev/null +++ b/scripts/benchmarks/benches/oneapi.py @@ -0,0 +1,86 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from pathlib import Path +from utils.utils import download, run +from .options import options +import os + +class OneAPI: + # random unique number for benchmark oneAPI installation + ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + def __init__(self): + self.oneapi_dir = os.path.join(options.workdir, 'oneapi') + Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) + # delete if some option is set? + + # can we just hardcode these links? 
+ self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') + self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') + return + + def install_package(self, name, url): + package_path = os.path.join(self.oneapi_dir, name) + if Path(package_path).exists(): + print(f"{package_path} exists, skipping installing oneAPI package {name}...") + return + + package = download(self.oneapi_dir, url, f'package_{name}.sh') + try: + print(f"installing f{name}") + run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") + except: + print("oneAPI installation likely exists already") + return + print(f"f{name} installation complete") + + def package_dir(self, package, dir): + return os.path.join(self.oneapi_dir, package, 'latest', dir) + + def package_cmake(self, package): + package_lib = self.package_dir(package, 'lib') + return os.path.join(package_lib, 'cmake', package) + + def mkl_lib(self): + return self.package_dir('mkl', 'lib') + + def mkl_include(self): + return self.package_dir('mkl', 'include') + + def mkl_cmake(self): + return self.package_cmake('mkl') + + def dnn_lib(self): + return self.package_dir('dnnl', 'lib') + + def dnn_include(self): + return self.package_dir('dnnl', 'include') + + def dnn_cmake(self): + return self.package_cmake('dnnl') + + def tbb_lib(self): + return self.package_dir('tbb', 'lib') + + def tbb_cmake(self): + return self.package_cmake('tbb') + + def compiler_lib(self): + return self.package_dir('compiler', 'lib') + + def ld_libraries(self): + return [ + self.compiler_lib(), + self.mkl_lib(), + self.tbb_lib(), + self.dnn_lib() + ] + +oneapi_instance = None + +def get_oneapi() -> OneAPI: # oneAPI singleton + if not hasattr(get_oneapi, "instance"): + get_oneapi.instance = OneAPI() + return get_oneapi.instance diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index 5997cdedb8..03b0db7128 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -8,6 +8,7 @@ class Compare(Enum): @dataclass class Options: + workdir: str = None sycl: str = None ur: str = None ur_adapter: str = None diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 605cf03fd4..705421d963 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -10,6 +10,9 @@ from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi +import shutil + import os class VelocityBench(Suite): @@ -35,7 +38,10 @@ def benchmarks(self) -> list[Benchmark]: CudaSift(self), Easywave(self), QuickSilver(self), - SobelFilter(self) + SobelFilter(self), + DLCifar(self), + DLMnist(self), + SVM(self) ] class VelocityBase(Benchmark): @@ -50,6 +56,12 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str): def download_deps(self): return + def extra_cmake_args(self) -> list[str]: + return [] + + def ld_libraries(self) -> list[str]: + return [] + def setup(self): self.download_deps() self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) @@ -62,8 +74,10 @@ def setup(self): f"-S {self.code_path}", f"-DCMAKE_BUILD_TYPE=Release" ] + configure_command += self.extra_cmake_args() + 
run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) - run(f"cmake --build {build_path} -j", add_sycl=True) + run(f"cmake --build {build_path} -j", add_sycl=True, ld_library=self.ld_libraries()) def bin_args(self) -> list[str]: return [] @@ -82,7 +96,7 @@ def run(self, env_vars) -> list[Result]: ] command += self.bin_args() - result = self.run_bench(command, env_vars) + result = self.run_bench(command, env_vars, ld_library=self.ld_libraries()) return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, unit=self.unit) ] @@ -136,7 +150,6 @@ def __init__(self, vb: VelocityBench): def download_deps(self): self.download("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True) - return def name(self): return "Velocity-Bench Sobel Filter" @@ -228,7 +241,6 @@ def get_last_elapsed_time(self, log_file_path) -> float: def parse_output(self, stdout: str) -> float: return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log")) - class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("cudaSift", "cudaSift", vb, "ms") @@ -248,3 +260,103 @@ def parse_output(self, stdout: str) -> float: return float(match.group(1)) else: raise ValueError("Failed to parse benchmark output.") + +class DLCifar(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-cifar", "dl-cifar_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-cifar hardcodes the path to this dataset as "../../datasets/cifar-10-binary"... + self.download("datasets", "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", "cifar-10-binary.tar.gz", untar=True, skip_data_dir=True) + return + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-cifar" + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-cifar - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class DLMnist(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-mnist", "dl-mnist-sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-mnist hardcodes the path to this dataset as "../../datasets/"... 
+ self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz", "train-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz", "train-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz", "t10k-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz", "t10k-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-mnist" + + def bin_args(self): + return [ + "-conv_algo", "ONEDNN_AUTO" + ] + + # TODO: This shouldn't be required. + # The application crashes with a segfault without it. + def extra_env_vars(self): + return { + "NEOReadDebugKeys":"1", + "DisableScratchPages":"0", + } + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-mnist - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class SVM(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("svm", "svm_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench svm" + + def bin_args(self): + return [ + f"{self.code_path}/a9a", + f"{self.code_path}/a.m", + ] + + def parse_output(self, stdout: str) -> float: + match = re.search(r'Total elapsed time : (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index c83825c9e5..bca0f01553 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -179,6 +179,7 @@ def validate_and_parse_env_args(env_args): args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.workdir = args.benchmark_directory options.verbose = args.verbose options.rebuild = not args.no_rebuild options.sycl = args.sycl diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index d077184e5c..0bb954fab2 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import gzip import os import shutil import subprocess @@ -58,7 +59,7 @@ def git_clone(dir, name, repo, commit): return repo_path def prepare_bench_cwd(dir): - # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + # we need 2 deep to workaround a problem with a fixed relative paths in some velocity benchmarks options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) @@ -97,7 +98,7 @@ def create_build_path(directory, name): return build_path -def 
download(dir, url, file, untar = False):
+def download(dir, url, file, untar = False, unzip = False):
     data_file = os.path.join(dir, file)
     if not Path(data_file).exists():
         print(f"{data_file} does not exist, downloading")
@@ -106,6 +107,10 @@ def download(dir, url, file, untar = False):
             file = tarfile.open(data_file)
             file.extractall(dir)
             file.close()
+        if unzip:
+            [stripped_gz, _] = os.path.splitext(data_file)
+            with gzip.open(data_file, 'rb') as f_in, open(stripped_gz, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
     else:
         print(f"{data_file} exists, skipping...")
     return data_file

From 008c998d0d03063b53c5d078b4074e12fc7b138f Mon Sep 17 00:00:00 2001
From: Ross Brunton
Date: Mon, 2 Dec 2024 12:16:10 +0000
Subject: [PATCH 028/148] Add sanitizer ignorelist for cfi

It seems that, at least if LLVM is to be believed, this failure is due
to something in libc++. This change adds an ignorelist that ignores
said function.
---
 cmake/helpers.cmake                                | 14 ++++++++++----
 sanitizer-ignorelist.txt                           |  6 ++++++
 .../exp_command_buffer_adapter_cuda.match          | 11 -----------
 .../exp_command_buffer_adapter_hip.match           | 10 ----------
 4 files changed, 16 insertions(+), 25 deletions(-)
 create mode 100644 sanitizer-ignorelist.txt
 delete mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match
 delete mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match

diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index 8cc5353609..c0fd7ab90c 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -79,6 +79,13 @@ else()
     set(CXX_HAS_CFI_SANITIZE OFF)
 endif()
 
+set(CFI_FLAGS "")
+if (CXX_HAS_CFI_SANITIZE)
+    # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't
+    # guarantee. -fsanitize=cfi depends on -flto
+    set(CFI_FLAGS "-flto -fsanitize=cfi -fno-sanitize=cfi-icall -fsanitize-ignorelist=${CMAKE_SOURCE_DIR}/sanitizer-ignorelist.txt")
+endif()
+
 function(add_ur_target_compile_options name)
     if(NOT MSVC)
         target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2)
@@ -95,9 +102,8 @@ function(add_ur_target_compile_options name)
             -fPIC
             -fstack-protector-strong
             -fvisibility=hidden
-            # cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't
-            # guarantee. -fsanitize=cfi depends on -flto
-            $<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
+
+            ${CFI_FLAGS}
 
             $<$<BOOL:${CXX_HAS_FCF_PROTECTION_FULL}>:-fcf-protection=full>
             $<$<BOOL:${CXX_HAS_FSTACK_CLASH_PROTECTION}>:-fstack-clash-protection>
@@ -135,7 +141,7 @@ function(add_ur_target_link_options name)
     if(NOT MSVC)
         if (NOT APPLE)
             target_link_options(${name} PRIVATE
-                $<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
+                ${CFI_FLAGS}
                 "LINKER:-z,relro,-z,now,-z,noexecstack"
             )
             if (UR_DEVELOPER_MODE)
diff --git a/sanitizer-ignorelist.txt b/sanitizer-ignorelist.txt
new file mode 100644
index 0000000000..85e8adc38d
--- /dev/null
+++ b/sanitizer-ignorelist.txt
@@ -0,0 +1,6 @@
+[cfi-unrelated-cast]
+# std::_Sp_counted_ptr_inplace::_Sp_counted_ptr_inplace() (libstdc++).
+# This ctor is used by std::make_shared and needs to cast to uninitialized T*
+# in order to call std::allocator_traits::construct.
+# See: https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/cfi/cfi_ignorelist.txt +fun:_ZNSt23_Sp_counted_ptr_inplace* diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match deleted file mode 100644 index 40182b9125..0000000000 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match +++ /dev/null @@ -1,11 +0,0 @@ -# Note: This file is only for use with cts_exe.py -# These cause SIGILL when built with -fsanitize=cfi on Nvidia -{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/* -{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/* -{{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/* diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match deleted file mode 100644 index da8d6dee07..0000000000 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ /dev/null @@ -1,10 +0,0 @@ -# Note: This file is only for use with cts_exe.py -# These cause SIGILL when built with -fsanitize=cfi on AMD -{{OPT}}urCommandBufferKernelHandleUpdateTest.Success/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.UpdateAgain/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RestoreOriginalKernel/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.KernelAlternativeNotRegistered/* -{{OPT}}urCommandBufferKernelHandleUpdateTest.RegisterInvalidKernelAlternative/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateDimensionsWithoutUpdatingKernel/* -{{OPT}}urCommandBufferValidUpdateParametersTest.UpdateOnlyLocalWorkSize/* -{{OPT}}urCommandBufferValidUpdateParametersTest.SuccessNullptrHandle/* From 4e42dd8b9538e7935f78fa78e22ef04d44fa45c9 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 17 Sep 2024 17:39:08 +0100 Subject: [PATCH 029/148] Add SDL hardening flags for MSVC Enable compiler flags which harden MSVC builds against mistakes and vulnerabilities. 
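
For context, the class of defect these options guard against, in a
contrived sketch (not code from this repository): /GS plants a security
cookie after the stack buffer and fails fast on an overrun, while /sdl
upgrades several related warnings to errors and adds extra runtime
checks.

    #include <cstring>

    // Contrived: copies attacker-controlled data into a fixed stack buffer.
    // With /GS, a cookie placed after `buf` is verified on return, so an
    // overrun aborts the process instead of silently corrupting the stack.
    void parse_name(const char *untrusted, size_t len) {
        char buf[16];
        std::memcpy(buf, untrusted, len); // len > sizeof(buf) overruns the frame
        buf[sizeof(buf) - 1] = '\0';
    }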
--- cmake/helpers.cmake | 33 +++++++++++++------ .../conformance/exp_command_buffer/fixtures.h | 1 + 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 8cc5353609..191b1746d5 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -114,18 +114,28 @@ function(add_ur_target_compile_options name) elseif(MSVC) target_compile_options(${name} PRIVATE $<$:/MP> # clang-cl.exe does not support /MP - /W3 /MD$<$:d> - /GS - /DWIN32_LEAN_AND_MEAN - /DNOMINMAX + + /W3 + /GS # Enable: Buffer security check + /Gy # Enable: Function-level linking + + $<$:/sdl> # Enable: Additional SDL checks + $<$:/Qspectre> # Enable: Mitigate Spectre variant 1 vulnerabilities + + /wd4267 # Disable: 'var' : conversion from 'size_t' to 'type', possible loss of data + /wd6244 # Disable: local declaration of 'variable' hides previous declaration + /wd6246 # Disable: local declaration of 'variable' hides declaration of same name in outer scope + ) + + target_compile_definitions(${name} PRIVATE + WIN32_LEAN_AND_MEAN NOMINMAX # Cajole Windows.h to define fewer symbols + _CRT_SECURE_NO_WARNINGS # Slience warnings about getenv ) if(UR_DEVELOPER_MODE) - # _CRT_SECURE_NO_WARNINGS used mainly because of getenv - # C4267: The compiler detected a conversion from size_t to a smaller type. target_compile_options(${name} PRIVATE - /WX /GS /D_CRT_SECURE_NO_WARNINGS /wd4267 + /WX # Enable: Treat all warnings as errors ) endif() endif() @@ -149,9 +159,12 @@ function(add_ur_target_link_options name) endif() elseif(MSVC) target_link_options(${name} PRIVATE - LINKER:/DYNAMICBASE - LINKER:/HIGHENTROPYVA - LINKER:/NXCOMPAT + LINKER:/DYNAMICBASE # Enable: Modify header to indicate ASLR should be use + LINKER:/HIGHENTROPYVA # Enable: High-entropy address space layout randomization (ASLR) + $<$: + LINKER:/NXCOMPAT # Enable: Data Execution Prevention + LINKER:/LTCG # Enable: Link-time code generation + > ) endif() endfunction() diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 42bee05b5a..959b3e7bb1 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -6,6 +6,7 @@ #ifndef UR_CONFORMANCE_COMMAND_BUFFER_FIXTURES_H_INCLUDED #define UR_CONFORMANCE_COMMAND_BUFFER_FIXTURES_H_INCLUDED +#include #include namespace uur { From 79f3ccfccd80bdfe7b651f4d895277f15adb051c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 8 Nov 2024 10:21:34 +0000 Subject: [PATCH 030/148] [CUDA][HIP] Fix for command-buffer local argument update After setting kernel arguments during update, we need to reset the amount of local memory used. --- source/adapters/cuda/command_buffer.cpp | 22 +- .../device_code/saxpy_usm_local_mem.cpp | 18 +- .../update/local_memory_update.cpp | 505 ++++++++++++++---- 3 files changed, 431 insertions(+), 114 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 527c339783..ec215c8c22 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -1396,14 +1396,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params; + const auto LocalSize = KernelCommandHandle->Kernel->getLocalSize(); + if (LocalSize != 0) { + // Clean the local size, otherwise calling updateKernelArguments() in + // future updates with local arguments will incorrectly increase the + // size further. 
+    KernelCommandHandle->Kernel->clearLocalSize();
+  }
+
   Params.func = CuFunc;
-  Params.gridDimX = BlocksPerGrid[0];
-  Params.gridDimY = BlocksPerGrid[1];
-  Params.gridDimZ = BlocksPerGrid[2];
-  Params.blockDimX = ThreadsPerBlock[0];
-  Params.blockDimY = ThreadsPerBlock[1];
-  Params.blockDimZ = ThreadsPerBlock[2];
-  Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
+  Params.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
+  Params.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
+  Params.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
+  Params.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
+  Params.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
+  Params.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
+  Params.sharedMemBytes = LocalSize;
   Params.kernelParams =
       const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());
 
diff --git a/test/conformance/device_code/saxpy_usm_local_mem.cpp b/test/conformance/device_code/saxpy_usm_local_mem.cpp
index 7ef17e59b5..c978caa27d 100644
--- a/test/conformance/device_code/saxpy_usm_local_mem.cpp
+++ b/test/conformance/device_code/saxpy_usm_local_mem.cpp
@@ -15,15 +15,27 @@ int main() {
     uint32_t A = 42;
     sycl_queue.submit([&](sycl::handler &cgh) {
-        sycl::local_accessor<uint32_t, 1> local_mem(local_size, cgh);
+        sycl::local_accessor<uint32_t, 1> local_mem_A(local_size, cgh);
+        sycl::local_accessor<uint32_t, 1> local_mem_B(1, cgh);
+
         cgh.parallel_for(
             sycl::nd_range<1>{{array_size}, {local_size}},
             [=](sycl::nd_item<1> itemId) {
                 auto i = itemId.get_global_linear_id();
                 auto local_id = itemId.get_local_linear_id();
-                local_mem[local_id] = i;
-                Z[i] = A * X[i] + Y[i] + local_mem[local_id] +
+
+                local_mem_A[local_id] = i;
+                if (i == 0) {
+                    local_mem_B[0] = 0xA;
+                }
+
+                Z[i] = A * X[i] + Y[i] + local_mem_A[local_id] +
                        itemId.get_local_range(0);
+
+                if (i == 0) {
+                    Z[i] += local_mem_B[0];
+                }
+
             });
     });
     return 0;
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index c295556fdb..b9a9e1ba01 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -8,8 +8,7 @@
 #include
 
 // Test that updating a command-buffer with a single kernel command
-// taking a local memory argument works correctly.
-
+// taking local memory arguments works correctly.
struct LocalMemoryUpdateTestBase : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest { virtual void SetUp() override { @@ -38,11 +37,11 @@ struct LocalMemoryUpdateTestBase std::memcpy(shared_ptr, pattern.data(), allocation_size); } size_t current_index = 0; - // Index 0 is local_mem arg + // Index 0 is local_mem_a arg ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, - local_mem_size, nullptr)); + local_mem_a_size, nullptr)); - //Hip has extr args for local mem at index 1-3 + // Hip has extra args for local mem at index 1-3 if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(local_size), nullptr, @@ -55,16 +54,31 @@ struct LocalMemoryUpdateTestBase &local_size)); } - // Index 1 is output + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + } + + // Index 2 is output ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[0])); - // Index 2 is A + // Index 3 is A ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A), nullptr, &A)); - // Index 3 is X + // Index 4 is X ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[1])); - // Index 4 is Y + // Index 5 is Y ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, shared_ptrs[2])); } @@ -73,6 +87,9 @@ struct LocalMemoryUpdateTestBase size_t length, size_t local_size) { for (size_t i = 0; i < length; i++) { uint32_t result = A * X[i] + Y[i] + i + local_size; + if (i == 0) { + result += 0xA; + } ASSERT_EQ(result, output[i]); } } @@ -89,7 +106,8 @@ struct LocalMemoryUpdateTestBase } static constexpr size_t local_size = 4; - static constexpr size_t local_mem_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = sizeof(uint32_t); static constexpr size_t global_size = 16; static constexpr size_t global_offset = 0; static constexpr size_t n_dimensions = 1; @@ -127,7 +145,9 @@ struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTest); -TEST_P(LocalMemoryUpdateTest, UpdateParameters) { +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values. 
+TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -139,63 +159,154 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) { Validate(output, X, Y, A, global_size, local_size); // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 - uint32_t new_A = 33; + // New local_mem_b at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} + +// Test updating A,X,Y parameters to new values and omitting local memory parameters +// from the update. 
+TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) { + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array + new_value_descs; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue }; - // New Y at index 4 + // New X at index 4 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -212,7 +323,9 @@ TEST_P(LocalMemoryUpdateTest, UpdateParameters) { Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } -TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { +// Test updating A,X,Y parameters to new values and local memory parameters +// to new values. 
+TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -230,7 +343,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { size_t new_local_size = local_size * 2; size_t new_local_mem_size = new_local_size * sizeof(uint32_t); - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext @@ -267,33 +380,187 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersAndLocalSize) { }); } - // New A at index 2 + // New local_mem_b at index 1 + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + } + + // New A at index 3 uint32_t new_A = 33; new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }); - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + static_cast(new_value_descs.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 
0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size); +} + +// Test updating A,X,Y parameters to new values and only one of the local memory +// parameters, which is set to a new values. +TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + std::vector + new_value_descs{}; + + size_t new_local_size = local_size * 2; + size_t new_local_mem_size = new_local_size * sizeof(uint32_t); + // New local_mem_a at index 0 + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + } + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }); + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs @@ -333,10 +600,12 @@ struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase { // Append kernel command to command-buffer and close command-buffer for (unsigned node = 0; node < nodes; node++) { - // We need to set the local memory arg each time because it is - // cleared in the kernel handle after being used. +#if 1 // TODO can we remove? \ + // We need to set the local memory arg each time because it is \ + // cleared in the kernel handle after being used. 
ASSERT_SUCCESS( - urKernelSetArgLocal(kernel, 0, local_mem_size, nullptr)); + urKernelSetArgLocal(kernel, 0, local_mem_a_size, nullptr)); +#endif ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, @@ -363,6 +632,8 @@ struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryMultiUpdateTest); +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values. TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -375,63 +646,75 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { Validate(output, X, Y, A, global_size, local_size); // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 - uint32_t new_A = 33; + // New local_mem_b at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, 
// pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -450,65 +733,79 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateParameters) { Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } +// Test updating A,X,Y parameters to new values and local memory parameters +// to original values, but without doing a blocking wait. TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { // Update inputs - ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; - ur_exp_command_buffer_update_value_arg_desc_t new_value_descs[2]; + std::array + new_input_descs; + std::array + new_value_descs; - // New local_mem at index 0 + // New local_mem_a at index 0 new_value_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - local_mem_size, // argSize + local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }; - // New A at index 2 - uint32_t new_A = 33; + // New local_mem_a at index 1 new_value_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2 + hip_arg_offset, // argIndex + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex sizeof(new_A), // argSize nullptr, // pProperties &new_A, // hArgValue }; - // New X at index 3 + // New X at index 4 new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 3 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue }; - // New Y at index 4 + // New Y at index 5 new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext - 4 + hip_arg_offset, // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue }; // Update kernel inputs ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - 2, // numNewPointerArgs - 2, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs, // pNewPointerArgList - new_value_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; // Enqueue without calling urQueueFinish after ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, From 31dc7900375176ca92f55aefa6e216152ffe30a6 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 14 Nov 2024 09:10:33 +0000 Subject: [PATCH 031/148] Improve solution Iterate on previous 
solution so that the local argument offsets at following
indices are updated when an earlier local argument is updated
---
 source/adapters/cuda/command_buffer.cpp       |  25 +-
 source/adapters/cuda/enqueue.cpp              |   6 -
 source/adapters/cuda/kernel.hpp               |  98 +++++--
 source/adapters/hip/command_buffer.cpp        |   3 -
 source/adapters/hip/enqueue.cpp               |   2 -
 source/adapters/hip/kernel.hpp                | 100 +++++--
 .../device_code/saxpy_usm_local_mem.cpp       |  15 +-
 ...xp_command_buffer_adapter_native_cpu.match |   7 +-
 .../update/local_memory_update.cpp            | 258 ++++++++++++++++--
 9 files changed, 411 insertions(+), 103 deletions(-)

diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index ec215c8c22..4b4b2cffe5 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -522,9 +522,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
        DepsList.data(), DepsList.size(), &NodeParams));

-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
    // Add signal node if external return event is used.
    CUgraphNode SignalNode = nullptr;
    if (phEvent) {
@@ -1396,22 +1393,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(

  CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;

-  const auto LocalSize = KernelCommandHandle->Kernel->getLocalSize();
-  if (LocalSize != 0) {
-    // Clean the local size, otherwise calling updateKernelArguments() in
-    // future updates with local arguments will incorrectly increase the
-    // size further.
-    KernelCommandHandle->Kernel->clearLocalSize();
-  }
-
  Params.func = CuFunc;
-  Params.gridDimX = static_cast(BlocksPerGrid[0]);
-  Params.gridDimY = static_cast(BlocksPerGrid[1]);
-  Params.gridDimZ = static_cast(BlocksPerGrid[2]);
-  Params.blockDimX = static_cast(ThreadsPerBlock[0]);
-  Params.blockDimY = static_cast(ThreadsPerBlock[1]);
-  Params.blockDimZ = static_cast(ThreadsPerBlock[2]);
-  Params.sharedMemBytes = LocalSize;
+  Params.gridDimX = BlocksPerGrid[0];
+  Params.gridDimY = BlocksPerGrid[1];
+  Params.gridDimZ = BlocksPerGrid[2];
+  Params.blockDimX = ThreadsPerBlock[0];
+  Params.blockDimY = ThreadsPerBlock[1];
+  Params.blockDimZ = ThreadsPerBlock[2];
+  Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
  Params.kernelParams =
      const_cast(KernelCommandHandle->Kernel->getArgIndices().data());

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index fc3d0220e8..54a0f778fb 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -493,9 +493,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
        ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
        LocalSize, CuStream, const_cast(ArgIndices.data()), nullptr));

-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
    if (phEvent) {
      UR_CHECK_ERROR(RetImplEvent->record());
      *phEvent = RetImplEvent.release();
@@ -673,9 +670,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
        const_cast(ArgIndices.data()),
        nullptr));

-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
    if (phEvent) {
      UR_CHECK_ERROR(RetImplEvent->record());
      *phEvent = RetImplEvent.release();
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 7ad20a4f0e..31088da7cc 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -61,10 +61,22 @@ struct ur_kernel_handle_t_ {
  using args_t = std::array;
  using args_size_t = std::vector;
  using args_index_t = std::vector;
+    /// Storage shared by all args which is mem
copied into when adding a new
+    /// argument.
    args_t Storage;
+    /// Aligned size of each parameter, including padding.
    args_size_t ParamSizes;
+    /// Byte offset into \p Storage allocation for each parameter.
    args_index_t Indices;
-    args_size_t OffsetPerIndex;
+    /// Aligned size in bytes for each local memory parameter after padding has
+    /// been added. Zero if the argument at the index isn't a local memory
+    /// argument.
+    args_size_t AlignedLocalMemSize;
+    /// Original size in bytes for each local memory parameter, prior to being
+    /// padded to appropriate alignment. Zero if the argument at the index
+    /// isn't a local memory argument.
+    args_size_t OriginalLocalMemSize;
+
    // A struct to keep track of memargs so that we can do dependency analysis
    // at urEnqueueKernelLaunch
    struct mem_obj_arg {
@@ -93,7 +105,8 @@ struct ur_kernel_handle_t_ {
        Indices.resize(Index + 2, Indices.back());
        // Ensure enough space for the new argument
        ParamSizes.resize(Index + 1);
-        OffsetPerIndex.resize(Index + 1);
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
      }
      ParamSizes[Index] = Size;
      // calculate the insertion point on the array
@@ -102,28 +115,83 @@ struct ur_kernel_handle_t_ {
      // Update the stored value for the argument
      std::memcpy(&Storage[InsertPos], Arg, Size);
      Indices[Index] = &Storage[InsertPos];
-      OffsetPerIndex[Index] = LocalSize;
+      AlignedLocalMemSize[Index] = LocalSize;
    }

-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
+    /// Returns the padded size and offset of a local memory argument.
+    /// Local memory arguments need to be padded if the alignment for the size
+    /// doesn't match the current offset into the kernel local data.
+    /// @param Index Kernel arg index.
+    /// @param Size User passed size of local parameter.
+    /// @return Tuple of (Aligned size, Aligned offset into local data).
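+    /// For example, if argument 0 is a 12 byte local argument, a call for
+    /// Index 1 with Size 32 uses Alignment = min(128, 32) = 32, pads the
+    /// offset from 12 up to 32, and returns (52, 32); the returned size
+    /// includes the 20 padding bytes so that accumulating the aligned sizes
+    /// of preceding arguments always gives the next argument's start offset.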
+ std::pair calcAlignedLocalArgument(size_t Index, + size_t Size) { + // Store the unpadded size of the local argument + if (Index + 2 > Indices.size()) { + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); + } + OriginalLocalMemSize[Index] = Size; + + // Calculate the current starting offset into local data + const size_t LocalOffset = std::accumulate( + std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); - // maximum required alignment is the size of the largest vector type + // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; - // for arguments smaller than the maximum alignment simply align to the + // For arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); - // align the argument + // Align the argument size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; + const size_t Pad = LocalOffset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } + const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset); + return std::make_pair(AlignedLocalSize, AlignedLocalOffset); + } + + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + size_t AlignedLocalSize, AlignedLocalOffset; + std::tie(AlignedLocalSize, AlignedLocalOffset) = + calcAlignedLocalArgument(Index, Size); + + // Store argument details addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - Size + (AlignedLocalOffset - LocalOffset)); + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, updated the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. 
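+      // Note that arguments at earlier indices are unaffected: an argument's
+      // offset only depends on the aligned sizes of the arguments before it.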
+ const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; + if (OriginalLocalSize == 0) { + // Skip if successor argument isn't a local memory arg + continue; + } + + // Recalculate alignment + size_t SuccAlignedLocalSize, SuccAlignedLocalOffset; + std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) = + calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); + + // Store new local memory size + AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; + + // Store new offset into local data + const size_t InsertPos = + std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + SuccIndex, size_t{0}); + std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + sizeof(size_t)); + } } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { @@ -145,15 +213,11 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clearLocalSize() { - std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - } - const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { - return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); + return std::accumulate(std::begin(AlignedLocalMemSize), + std::end(AlignedLocalMemSize), 0); } } Args; @@ -240,7 +304,5 @@ struct ur_kernel_handle_t_ { uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - void clearLocalSize() { Args.clearLocalSize(); } - size_t getRegsPerThread() const noexcept { return RegsPerThread; }; }; diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 9fed5db2f8..538c2ff85a 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -396,9 +396,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( DepsList.data(), DepsList.size(), &NodeParams)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - // Get sync point and register the node with it. auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 025a3f41f4..b9aa097848 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -324,8 +324,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); - hKernel->clearLocalSize(); - if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index afea69832b..08441e288f 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -56,10 +56,22 @@ struct ur_kernel_handle_t_ { using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; + /// Storage shared by all args which is mem copied into when adding a new + /// argument. args_t Storage; + /// Aligned size of each parameter, including padding. args_size_t ParamSizes; + /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; - args_size_t OffsetPerIndex; + /// Aligned size in bytes for each local memory parameter after padding has + /// been added. Zero if the argument at the index isn't a local memory + /// argument. 
+ args_size_t AlignedLocalMemSize; + /// Original size in bytes for each local memory parameter, prior to being + /// padded to appropriate alignment. Zero if the argument at the index + /// isn't a local memory argument. + args_size_t OriginalLocalMemSize; + // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch struct mem_obj_arg { @@ -88,7 +100,8 @@ struct ur_kernel_handle_t_ { Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); - OffsetPerIndex.resize(Index + 1); + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); } ParamSizes[Index] = Size; // calculate the insertion point on the array @@ -97,28 +110,83 @@ struct ur_kernel_handle_t_ { // Update the stored value for the argument std::memcpy(&Storage[InsertPos], Arg, Size); Indices[Index] = &Storage[InsertPos]; - OffsetPerIndex[Index] = LocalSize; + AlignedLocalMemSize[Index] = LocalSize; } - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); + /// Returns the padded size and offset of a local memory argument. + /// Local memory arguments need to be padded if the alignment for the size + /// doesn't match the current offset into the kernel local data. + /// @param Index Kernel arg index. + /// @param Size User passed size of local parameter. + /// @return Tuple of (Aligned size, Aligned offset into local data). + std::pair calcAlignedLocalArgument(size_t Index, + size_t Size) { + // Store the unpadded size of the local argument + if (Index + 2 > Indices.size()) { + AlignedLocalMemSize.resize(Index + 1); + OriginalLocalMemSize.resize(Index + 1); + } + OriginalLocalMemSize[Index] = Size; - // maximum required alignment is the size of the largest vector type + // Calculate the current starting offset into local data + const size_t LocalOffset = std::accumulate( + std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); + + // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; - // for arguments smaller than the maximum alignment simply align to the + // For arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); - // align the argument + // Align the argument size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; + const size_t Pad = LocalOffset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } - addArg(Index, sizeof(size_t), (const void *)&AlignedLocalOffset, - Size + AlignedLocalOffset - LocalOffset); + const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset); + return std::make_pair(AlignedLocalSize, AlignedLocalOffset); + } + + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + size_t AlignedLocalSize, AlignedLocalOffset; + std::tie(AlignedLocalSize, AlignedLocalOffset) = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, updated the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. 
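+      // Note that arguments at earlier indices are unaffected: an argument's
+      // offset only depends on the aligned sizes of the arguments before it.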
+ const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; + if (OriginalLocalSize == 0) { + // Skip if successor argument isn't a local memory arg + continue; + } + + // Recalculate alignment + size_t SuccAlignedLocalSize, SuccAlignedLocalOffset; + std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) = + calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); + + // Store new local memory size + AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; + + // Store new offset into local data + const size_t InsertPos = + std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + SuccIndex, size_t{0}); + std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + sizeof(size_t)); + } } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { @@ -140,15 +208,11 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clearLocalSize() { - std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - } - const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { - return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); + return std::accumulate(std::begin(AlignedLocalMemSize), + std::end(AlignedLocalMemSize), 0); } } Args; @@ -220,6 +284,4 @@ struct ur_kernel_handle_t_ { } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - - void clearLocalSize() { Args.clearLocalSize(); } }; diff --git a/test/conformance/device_code/saxpy_usm_local_mem.cpp b/test/conformance/device_code/saxpy_usm_local_mem.cpp index c978caa27d..c2bc3adc5e 100644 --- a/test/conformance/device_code/saxpy_usm_local_mem.cpp +++ b/test/conformance/device_code/saxpy_usm_local_mem.cpp @@ -16,7 +16,7 @@ int main() { sycl_queue.submit([&](sycl::handler &cgh) { sycl::local_accessor local_mem_A(local_size, cgh); - sycl::local_accessor local_mem_B(1, cgh); + sycl::local_accessor local_mem_B(local_size * 2, cgh); cgh.parallel_for( sycl::nd_range<1>{{array_size}, {local_size}}, @@ -25,17 +25,12 @@ int main() { auto local_id = itemId.get_local_linear_id(); local_mem_A[local_id] = i; - if (i == 0) { - local_mem_B[0] = 0xA; - } + local_mem_B[local_id * 2] = -i; + local_mem_B[(local_id * 2) + 1] = itemId.get_local_range(0); Z[i] = A * X[i] + Y[i] + local_mem_A[local_id] + - itemId.get_local_range(0); - - if (i == 0) { - Z[i] += local_mem_B[0]; - } - + local_mem_B[local_id * 2] + + local_mem_B[(local_id * 2) + 1]; }); }); return 0; diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index c6fe7ad962..e0543e81ed 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -37,7 +37,10 @@ {{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/* {{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/* {{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/* -{{OPT}}LocalMemoryUpdateTest.UpdateParameters/* -{{OPT}}LocalMemoryUpdateTest.UpdateParametersAndLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersSameLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersEmptyLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersSmallerLocalSize/* 
+{{OPT}}LocalMemoryUpdateTest.UpdateParametersLargerLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateParametersPartialLocalSize/* {{OPT}}LocalMemoryMultiUpdateTest.UpdateParameters/* {{OPT}}LocalMemoryMultiUpdateTest.UpdateWithoutBlocking/* diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index b9a9e1ba01..273f770303 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -86,10 +86,7 @@ struct LocalMemoryUpdateTestBase void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, size_t length, size_t local_size) { for (size_t i = 0; i < length; i++) { - uint32_t result = A * X[i] + Y[i] + i + local_size; - if (i == 0) { - result += 0xA; - } + uint32_t result = A * X[i] + Y[i] + local_size; ASSERT_EQ(result, output[i]); } } @@ -107,7 +104,7 @@ struct LocalMemoryUpdateTestBase static constexpr size_t local_size = 4; static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t); - static constexpr size_t local_mem_b_size = sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; static constexpr size_t global_size = 16; static constexpr size_t global_offset = 0; static constexpr size_t n_dimensions = 1; @@ -246,7 +243,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) { // Test updating A,X,Y parameters to new values and omitting local memory parameters // from the update. -TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) { +TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -275,7 +272,7 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) { }; // New X at index 4 - new_input_descs[1] = { + new_input_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext 4 + (2 * hip_arg_offset), // argIndex @@ -284,7 +281,7 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) { }; // New Y at index 5 - new_input_descs[2] = { + new_input_descs[1] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype nullptr, // pNext 5 + (2 * hip_arg_offset), // argIndex @@ -324,8 +321,163 @@ TEST_P(LocalMemoryUpdateTest, DISABLED_UpdateParametersEmptyLocalSize) { } // Test updating A,X,Y parameters to new values and local memory parameters -// to new values. -TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) { +// to new smaller values. 
+TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + std::vector + new_value_descs{}; + + size_t new_local_size = 2; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + // New local_mem_a at index 0 + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + } + + // New local_mem_b at index 1 + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + } + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }); + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + 
nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + static_cast(new_value_descs.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, new_local_size); +} + +// Test updating A,X,Y parameters to new values and local memory parameters +// to new larger values. +TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -341,14 +493,14 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) { std::vector new_value_descs{}; - size_t new_local_size = local_size * 2; - size_t new_local_mem_size = new_local_size * sizeof(uint32_t); + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); // New local_mem_a at index 0 new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - new_local_mem_size, // argSize + new_local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }); @@ -381,11 +533,12 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) { } // New local_mem_b at index 1 + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 1 + hip_arg_offset, // argIndex - new_local_mem_size, // argSize + new_local_mem_b_size, // argSize nullptr, // pProperties nullptr, // hArgValue }); @@ -478,7 +631,8 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersDifferentLocalSize) { } // Test updating A,X,Y parameters to new values and only one of the local memory -// parameters, which is set to a new values. +// parameters, which is set to a new values. Then a separate update call for +// the other local memory argument. 
TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -495,14 +649,14 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { std::vector new_value_descs{}; - size_t new_local_size = local_size * 2; - size_t new_local_mem_size = new_local_size * sizeof(uint32_t); + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); // New local_mem_a at index 0 new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext 0, // argIndex - new_local_mem_size, // argSize + new_local_mem_a_size, // argSize nullptr, // pProperties nullptr, // hArgValue }); @@ -583,6 +737,67 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { // Update kernel and enqueue command-buffer again ASSERT_SUCCESS( urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + + std::vector + second_update_value_args{}; + + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + // New local_mem_b at index 1 + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + new_local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }); + + if (backend == UR_PLATFORM_BACKEND_HIP) { + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 5, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 6, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + second_update_value_args.push_back({ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 7, // argIndex + sizeof(new_local_size), // argSize + nullptr, // pProperties + &new_local_size, // hArgValue + }); + } + + ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + static_cast( + second_update_value_args.size()), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + second_update_value_args.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle, + &second_update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -597,15 +812,8 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { struct LocalMemoryMultiUpdateTest : LocalMemoryUpdateTestBase { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBase::SetUp()); - // Append kernel command to command-buffer and close command-buffer for (unsigned node = 0; node < nodes; node++) { -#if 1 // TODO can we remove? \ - // We need to set the local memory arg each time because it is \ - // cleared in the kernel handle after being used. 
- ASSERT_SUCCESS( - urKernelSetArgLocal(kernel, 0, local_mem_a_size, nullptr)); -#endif ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, From 0b5bc826d1f128bb3f57892f60ecaddcf4f358ea Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 15 Nov 2024 11:22:41 +0000 Subject: [PATCH 032/148] Add extra more basic CTS test --- ...xp_command_buffer_adapter_native_cpu.match | 1 + .../update/local_memory_update.cpp | 69 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index e0543e81ed..3588eaea82 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -38,6 +38,7 @@ {{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/* {{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/* {{OPT}}LocalMemoryUpdateTest.UpdateParametersSameLocalSize/* +{{OPT}}LocalMemoryUpdateTest.UpdateLocalOnly/* {{OPT}}LocalMemoryUpdateTest.UpdateParametersEmptyLocalSize/* {{OPT}}LocalMemoryUpdateTest.UpdateParametersSmallerLocalSize/* {{OPT}}LocalMemoryUpdateTest.UpdateParametersLargerLocalSize/* diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 273f770303..2d0f24cb4e 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -16,6 +16,11 @@ struct LocalMemoryUpdateTestBase UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + // HIP has extra args for local memory so we define an offset for arg indices here for updating hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; ur_device_usm_access_capability_flags_t shared_usm_flags; @@ -241,6 +246,70 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSameLocalSize) { Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } +// Test only passing local memory parameters to update with the original values. 
+TEST_P(LocalMemoryUpdateTest, UpdateLocalOnly) {
+    // Run command-buffer prior to update and verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size, local_size);
+
+    // Update inputs
+    std::array<ur_exp_command_buffer_update_value_arg_desc_t, 2>
+        new_value_descs;
+
+    // New local_mem_a at index 0
+    new_value_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,          // pNext
+        0,                // argIndex
+        local_mem_a_size, // argSize
+        nullptr,          // pProperties
+        nullptr,          // hArgValue
+    };
+
+    // New local_mem_b at index 1
+    new_value_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,            // pNext
+        1 + hip_arg_offset, // argIndex
+        local_mem_b_size,   // argSize
+        nullptr,            // pProperties
+        nullptr,            // hArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                 // pNext
+        kernel,                  // hNewKernel
+        0,                       // numNewMemObjArgs
+        0,                       // numNewPointerArgs
+        new_value_descs.size(),  // numNewValueArgs
+        n_dimensions,            // newWorkDim
+        nullptr,                 // pNewMemObjArgList
+        nullptr,                 // pNewPointerArgList
+        new_value_descs.data(),  // pNewValueArgList
+        nullptr,                 // pNewGlobalWorkOffset
+        nullptr,                 // pNewGlobalWorkSize
+        nullptr,                 // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    Validate(output, X, Y, A, global_size, local_size);
+}
+
 // Test updating A,X,Y parameters to new values and omitting local memory parameters
 // from the update.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {
From a5d2bda6e0936059b5a6246c67540bf1ab2973eb Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Tue, 19 Nov 2024 08:56:29 +0000
Subject: [PATCH 033/148] Fix comment typos

Co-authored-by: Ben Tracy
Co-authored-by: aarongreig
---
 source/adapters/cuda/kernel.hpp                        |  2 +-
 source/adapters/hip/kernel.hpp                         |  2 +-
 .../exp_command_buffer/update/local_memory_update.cpp  | 11 ++++++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 31088da7cc..f0a554cfce 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -167,7 +167,7 @@ struct ur_kernel_handle_t_ {
                AlignedLocalSize);

      // For every existing local argument which follows at later argument
-      // indices, updated the offset and pointer into the kernel local memory.
+      // indices, update the offset and pointer into the kernel local memory.
      // Required as padding will need to be recalculated.
const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 08441e288f..141b1433fe 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -162,7 +162,7 @@ struct ur_kernel_handle_t_ { AlignedLocalSize); // For every existing local argument which follows at later argument - // indices, updated the offset and pointer into the kernel local memory. + // indices, update the offset and pointer into the kernel local memory. // Required as padding will need to be recalculated. const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 2d0f24cb4e..60c55d4a4c 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -21,7 +21,8 @@ struct LocalMemoryUpdateTestBase << "Local memory argument update not supported on Level Zero."; } - // HIP has extra args for local memory so we define an offset for arg indices here for updating + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; ur_device_usm_access_capability_flags_t shared_usm_flags; ASSERT_SUCCESS( @@ -313,7 +314,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateLocalOnly) { // Test updating A,X,Y parameters to new values and omitting local memory parameters // from the update. TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) { - // Run command-buffer prior to update an verify output + // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -547,7 +548,7 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { // Test updating A,X,Y parameters to new values and local memory parameters // to new larger values. TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { - // Run command-buffer prior to update an verify output + // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -700,10 +701,10 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { } // Test updating A,X,Y parameters to new values and only one of the local memory -// parameters, which is set to a new values. Then a separate update call for +// parameters, which is set to a new value. Then a separate update call for // the other local memory argument. 
TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { - // Run command-buffer prior to update an verify output + // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); From 7d126f30e830fb484ddff78fba8241abf3cc7bb5 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 19 Nov 2024 14:02:50 +0000 Subject: [PATCH 034/148] Add non command-buffer test --- scripts/core/CUDA.rst | 31 +++ scripts/core/HIP.rst | 10 + .../kernel/kernel_adapter_native_cpu.match | 3 + .../kernel/urKernelSetArgLocal.cpp | 200 ++++++++++++++++++ 4 files changed, 244 insertions(+) diff --git a/scripts/core/CUDA.rst b/scripts/core/CUDA.rst index 9771693113..d23d217ef4 100644 --- a/scripts/core/CUDA.rst +++ b/scripts/core/CUDA.rst @@ -148,6 +148,36 @@ take the extra global offset argument. Use of the global offset is not recommended for non SYCL compiler toolchains. This parameter can be ignored if the user does not wish to use the global offset. +Local Memory Arguments +---------------------- + +In UR local memory is a region of memory shared by all the work-items in +a work-group. A kernel function signature can include local memory address +space pointer arguments, which are set by the user with +``urKernelSetArgLocal`` with the number of bytes of local memory to allocate +and make available from the pointer argument. + +The CUDA adapter implements local memory arguments to a kernel as a single +``__shared__`` memory allocation, with each local address space pointer argument +to the kernel converted to a byte offset parameter to the single memory +allocation. Therefore for ``N`` local arguments that need set on a kernel with +``urKernelSetArgLocal``, the total aligned size is calculated for the single +memory allocation by the CUDA adapter and passed as the ``sharedMemBytes`` +argument to ``cuLaunchKernel`` (or variants like ``cuLaunchCooperativeKernel`` +or ``cudaGraphAddKernelNode``). + +For each kernel local memory parameter, aligned offsets into the single memory location +are calculated and passed at runtime via ``kernelParams`` when launching the kernel (or +adding as a graph node). When a user calls ``urKernelSetArgLocal`` with an +argument index that has already been set the CUDA adapter recalculates the size of the +single memory allocation and offsets of any local memory arguments at following indices. + +.. warning:: + + The CUDA UR adapter implementation of local memory assumes the kernel created + has been created by DPC++, instumenting the device code so that local memory + arguments are offsets rather than pointers. + Other Notes =========== @@ -164,4 +194,5 @@ Contributors ------------ * Hugh Delaney `hugh.delaney@codeplay.com `_ +* Ewan Crawford `ewan@codeplay.com `_ diff --git a/scripts/core/HIP.rst b/scripts/core/HIP.rst index 3ded0138ff..32d387c481 100644 --- a/scripts/core/HIP.rst +++ b/scripts/core/HIP.rst @@ -91,6 +91,15 @@ take the extra global offset argument. Use of the global offset is not recommended for non SYCL compiler toolchains. This parameter can be ignored if the user does not wish to use the global offset. +Local Memory Arguments +---------------------- + +.. todo:: + Copy and update CUDA doc + +.. 
todo:: + Document what extra args needed on HIP arg with local accessors + Other Notes =========== @@ -100,4 +109,5 @@ Contributors ------------ * Hugh Delaney `hugh.delaney@codeplay.com `_ +* Ewan Crawford `ewan@codeplay.com `_ diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 7ca10ec3d2..bd5333c609 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -38,6 +38,9 @@ urKernelRetainTest.InvalidNullHandleKernel/* urKernelSetArgLocalTest.Success/* urKernelSetArgLocalTest.InvalidNullHandleKernel/* urKernelSetArgLocalTest.InvalidKernelArgumentIndex/* +urKernelSetArgLocalMultiTest.Basic/* +urKernelSetArgLocalMultiTest.ReLaunch/* +urKernelSetArgLocalMultiTest.Overwrite/* urKernelSetArgMemObjTest.Success/* urKernelSetArgMemObjTest.InvalidNullHandleKernel/* urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/* diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 1d3789bf3a..fcbd5194a7 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include struct urKernelSetArgLocalTest : uur::urKernelTest { @@ -32,3 +33,202 @@ TEST_P(urKernelSetArgLocalTest, InvalidKernelArgumentIndex) { urKernelSetArgLocal(kernel, num_kernel_args + 1, local_mem_size, nullptr)); } + +// Test launching kernels with multiple local arguments return the expected +// outputs +struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 
3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + size_t current_index = 0; + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + } + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(local_size), nullptr, + &local_size)); + } + + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[0])); + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, sizeof(A), + nullptr, &A)); + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, current_index++, nullptr, + shared_ptrs[2])); + } + + void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, + size_t length, size_t local_size) { + for (size_t i = 0; i < length; i++) { + uint32_t result = A * X[i] + Y[i] + local_size; + ASSERT_EQ(result, output[i]); + } + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr size_t local_size = 4; + static constexpr size_t local_mem_a_size = local_size * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; + static constexpr size_t global_size = 16; + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr, + nullptr}; + + uint32_t hip_arg_offset = 0; + ur_platform_backend_t backend{}; +}; +UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetArgLocalMultiTest); + +TEST_P(urKernelSetArgLocalMultiTest, Basic) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + 
uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +} + +TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Relaunch with new arguments + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, A, global_size, local_size); +} + +// Overwrite local args to a larger value, then reset back to original +TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + size_t new_local_size = 2; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + size_t new_local_mem_b_size = new_local_size * sizeof(uint32_t) * 2; + size_t current_index = 0; + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + new_local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + } + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, current_index++, + new_local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, + sizeof(new_local_size), nullptr, + &new_local_size)); + } + + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &new_local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + Validate(output, X, Y, A, global_size, new_local_size); +} From e578228aa259c84f0c909b2993ca6659a0dfcd86 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 20 Nov 2024 12:41:46 +0000 Subject: [PATCH 035/148] Document hip extra arg behavior --- scripts/core/CUDA.rst | 27 +-- scripts/core/HIP.rst | 41 ++++- source/adapters/cuda/kernel.hpp | 6 +- source/adapters/hip/kernel.hpp | 6 +- .../update/local_memory_update.cpp | 169 +++++++++--------- .../kernel/urKernelSetArgLocal.cpp | 49 ++--- 6 files changed, 165 insertions(+), 133 deletions(-) diff --git a/scripts/core/CUDA.rst 
b/scripts/core/CUDA.rst index d23d217ef4..08b61bf9dc 100644 --- a/scripts/core/CUDA.rst +++ b/scripts/core/CUDA.rst @@ -157,25 +157,28 @@ space pointer arguments, which are set by the user with ``urKernelSetArgLocal`` with the number of bytes of local memory to allocate and make available from the pointer argument. -The CUDA adapter implements local memory arguments to a kernel as a single -``__shared__`` memory allocation, with each local address space pointer argument -to the kernel converted to a byte offset parameter to the single memory -allocation. Therefore for ``N`` local arguments that need set on a kernel with -``urKernelSetArgLocal``, the total aligned size is calculated for the single +The CUDA adapter implements local memory in a kernel as a single ``__shared__`` +memory allocation, and each individual local memory argument is a ``u32`` byte +offset kernel parameter which is combined inside the kernel with the +``__shared__`` memory allocation. Therefore for ``N`` local arguments that need +set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the +``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__`` memory allocation by the CUDA adapter and passed as the ``sharedMemBytes`` argument to ``cuLaunchKernel`` (or variants like ``cuLaunchCooperativeKernel`` -or ``cudaGraphAddKernelNode``). +or ``cuGraphAddKernelNode``). -For each kernel local memory parameter, aligned offsets into the single memory location -are calculated and passed at runtime via ``kernelParams`` when launching the kernel (or -adding as a graph node). When a user calls ``urKernelSetArgLocal`` with an -argument index that has already been set the CUDA adapter recalculates the size of the -single memory allocation and offsets of any local memory arguments at following indices. +For each kernel ``u32`` local memory offset parameter, aligned offsets into the +single memory location are calculated and passed at runtime by the adapter via +``kernelParams`` when launching the kernel (or adding the kernel as a graph +node). When a user calls ``urKernelSetArgLocal`` with an argument index that +has already been set on the kernel, the adapter recalculates the size of the +``__shared__`` memory allocation and offset for the index, as well as the +offsets of any local memory arguments at following indices. .. warning:: The CUDA UR adapter implementation of local memory assumes the kernel created - has been created by DPC++, instumenting the device code so that local memory + has been created by DPC++, instrumenting the device code so that local memory arguments are offsets rather than pointers. Other Notes diff --git a/scripts/core/HIP.rst b/scripts/core/HIP.rst index 32d387c481..920a5f5a3e 100644 --- a/scripts/core/HIP.rst +++ b/scripts/core/HIP.rst @@ -94,11 +94,42 @@ the user does not wish to use the global offset. Local Memory Arguments ---------------------- -.. todo:: - Copy and update CUDA doc - -.. todo:: - Document what extra args needed on HIP arg with local accessors +In UR local memory is a region of memory shared by all the work-items in +a work-group. A kernel function signature can include local memory address +space pointer arguments, which are set by the user with +``urKernelSetArgLocal`` with the number of bytes of local memory to allocate +and make available from the pointer argument. 
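+
+For example, a kernel whose argument 0 is a local memory pointer could have
+256 bytes of work-group local memory set on it as follows. This is a minimal
+sketch, assuming ``hKernel`` is a valid kernel handle built from DPC++ device
+code and ignoring error handling; the extra zero offset arguments are the
+ones DPC++ generated HIP kernels expect, described at the end of this
+section:
+
+.. code-block:: cpp
+
+    // Reserve 256 bytes of work-group local memory for argument 0. The
+    // adapter converts this into a byte offset parameter that is combined
+    // with a single __shared__ allocation at launch time.
+    urKernelSetArgLocal(hKernel, 0, 256, nullptr);
+
+    // Three accompanying offset arguments for the local accessor.
+    uint64_t zeroOffset = 0;
+    urKernelSetArgValue(hKernel, 1, sizeof(zeroOffset), nullptr, &zeroOffset);
+    urKernelSetArgValue(hKernel, 2, sizeof(zeroOffset), nullptr, &zeroOffset);
+    urKernelSetArgValue(hKernel, 3, sizeof(zeroOffset), nullptr, &zeroOffset);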
+ +The HIP adapter implements local memory in a kernel as a single ``__shared__`` +memory allocation, and each individual local memory argument is a ``u32`` byte +offset kernel parameter which is combined inside the kernel with the +``__shared__`` memory allocation. Therefore for ``N`` local arguments that need +set on a kernel with ``urKernelSetArgLocal``, the total aligned size across the +``N`` calls to ``urKernelSetArgLocal`` is calculated for the ``__shared__`` +memory allocation by the HIP adapter and passed as the ``sharedMemBytes`` +argument to ``hipModuleLaunchKernel`` or ``hipGraphAddKernelNode``. + +For each kernel ``u32`` local memory offset parameter, aligned offsets into the +single memory location are calculated and passed at runtime by the adapter via +``kernelParams`` when launching the kernel (or adding the kernel as a graph +node). When a user calls ``urKernelSetArgLocal`` with an argument index that +has already been set on the kernel, the adapter recalculates the size of the +``__shared__`` memory allocation and offset for the index, as well as the +offsets of any local memory arguments at following indices. + +.. warning:: + + The HIP UR adapter implementation of local memory assumes the kernel created + has been created by DPC++, instrumenting the device code so that local memory + arguments are offsets rather than pointers. + + +HIP kernels that are generated for DPC++ kernels with SYCL local accessors +contain extra value arguments on top of the local memory argument for the +local accessor. For each ``urKernelSetArgLocal`` argument, a user needs +to make 3 calls to ``urKernelSetArgValue`` with each of the next 3 consecutive +argument indexes. This represents a 3 dimensional offset into the local +accessor. Other Notes =========== diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index f0a554cfce..2b04dfba43 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -158,8 +158,7 @@ struct ur_kernel_handle_t_ { void addLocalArg(size_t Index, size_t Size) { // Get the aligned argument size and offset into local data - size_t AlignedLocalSize, AlignedLocalOffset; - std::tie(AlignedLocalSize, AlignedLocalOffset) = + auto [AlignedLocalSize, AlignedLocalOffset] = calcAlignedLocalArgument(Index, Size); // Store argument details @@ -178,8 +177,7 @@ struct ur_kernel_handle_t_ { } // Recalculate alignment - size_t SuccAlignedLocalSize, SuccAlignedLocalOffset; - std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) = + auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] = calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); // Store new local memory size diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 141b1433fe..c6d30e81ad 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -153,8 +153,7 @@ struct ur_kernel_handle_t_ { void addLocalArg(size_t Index, size_t Size) { // Get the aligned argument size and offset into local data - size_t AlignedLocalSize, AlignedLocalOffset; - std::tie(AlignedLocalSize, AlignedLocalOffset) = + auto [AlignedLocalSize, AlignedLocalOffset] = calcAlignedLocalArgument(Index, Size); // Store argument details @@ -173,8 +172,7 @@ struct ur_kernel_handle_t_ { } // Recalculate alignment - size_t SuccAlignedLocalSize, SuccAlignedLocalOffset; - std::tie(SuccAlignedLocalSize, SuccAlignedLocalOffset) = + auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] = calcAlignedLocalArgument(SuccIndex, OriginalLocalSize); // Store new local 
memory size diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 60c55d4a4c..c467c9783a 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -50,14 +50,14 @@ struct LocalMemoryUpdateTestBase // Hip has extra args for local mem at index 1-3 if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); } // Index 1 is local_mem_b arg @@ -65,14 +65,14 @@ struct LocalMemoryUpdateTestBase local_mem_b_size, nullptr)); if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); } // Index 2 is output @@ -119,6 +119,7 @@ struct LocalMemoryUpdateTestBase nullptr}; uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; }; struct LocalMemoryUpdateTest : LocalMemoryUpdateTestBase { @@ -424,26 +425,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 1, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 2, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 3, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } @@ -462,26 +463,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 5, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // 
pNext - 6, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 7, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } @@ -579,26 +580,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 1, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 2, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 3, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } @@ -617,26 +618,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersLargerLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 5, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 6, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 7, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } @@ -735,26 +736,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 1, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 1, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 2, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 2, // argIndex + sizeof(hip_local_offset), 
// argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); new_value_descs.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 3, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 3, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } @@ -826,26 +827,26 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersPartialLocalSize) { second_update_value_args.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 5, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 5, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); second_update_value_args.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 6, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 6, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); second_update_value_args.push_back({ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype nullptr, // pNext - 7, // argIndex - sizeof(new_local_size), // argSize - nullptr, // pProperties - &new_local_size, // hArgValue + 7, // argIndex + sizeof(hip_local_offset), // argSize + nullptr, // pProperties + &hip_local_offset, // hArgValue }); } diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index fcbd5194a7..380085bd16 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -72,14 +72,14 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { // Hip has extra args for local mem at index 1-3 if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); } // Index 1 is local_mem_b arg @@ -87,14 +87,14 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { local_mem_b_size, nullptr)); if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(local_size), nullptr, - &local_size)); + sizeof(hip_local_offset), + nullptr, &hip_local_offset)); } // Index 2 is output @@ -140,6 +140,7 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { nullptr}; uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; ur_platform_backend_t backend{}; }; UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetArgLocalMultiTest); @@ -200,14 +201,14 @@ 
TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { // Hip has extra args for local mem at index 1-3 if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); } // Index 1 is local_mem_b arg @@ -215,14 +216,14 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { new_local_mem_b_size, nullptr)); if (backend == UR_PLATFORM_BACKEND_HIP) { ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_index++, - sizeof(new_local_size), nullptr, - &new_local_size)); + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); } ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, From 8412c8bdfbf8262d7ee6d953a85cf956bb50c887 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 21 Nov 2024 16:12:45 +0000 Subject: [PATCH 036/148] Fix "use after release" issues In some cases, we use handles after releasing them, or incorrectly release handles we shouldn't. This doesn't cause any issues currently, but will when we start using reference counting in the loader. 
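The fix is to perform the leak-tracking bookkeeping for a release before
invoking the underlying release entry point, while the handle is still
guaranteed to be valid. A simplified sketch of the reordering (the real code
is generated from `valddi.cpp.mako` and repeated for every handle type in the
diff below; the handle type here is hypothetical and generation boilerplate is
omitted):

```cpp
ur_result_t urXxxRelease(ur_xxx_handle_t hHandle) {
  // Previously the tracker ran only after pfnRelease succeeded, by which
  // point the final release may already have destroyed the handle.
  if (getContext()->enableLeakChecking) {
    getContext()->refCountContext->decrementRefCount(hHandle, false);
  }
  // The driver release now runs last; its result is returned unchanged.
  return pfnRelease(hHandle);
}
```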
--- scripts/templates/valddi.cpp.mako | 20 +++-- source/loader/layers/validation/ur_valddi.cpp | 88 +++++++++---------- test/conformance/adapter/urAdapterRelease.cpp | 1 + test/conformance/device/urDeviceRelease.cpp | 2 + .../testing/include/uur/fixtures.h | 1 + 5 files changed, 62 insertions(+), 50 deletions(-) diff --git a/scripts/templates/valddi.cpp.mako b/scripts/templates/valddi.cpp.mako index 8cc4a9dc0f..7a18860ba9 100644 --- a/scripts/templates/valddi.cpp.mako +++ b/scripts/templates/valddi.cpp.mako @@ -94,6 +94,19 @@ namespace ur_validation_layer %endif %endfor + %for tp in tracked_params: + <% + tp_handle_funcs = next((hf for hf in handle_create_get_retain_release_funcs if th.subt(n, tags, tp['type']) in [hf['handle'], hf['handle'] + "*"]), None) + is_handle_to_adapter = ("_adapter_handle_t" in tp['type']) + %> + %if func_name in tp_handle_funcs['release']: + if( getContext()->enableLeakChecking ) + { + getContext()->refCountContext->decrementRefCount(${tp['name']}, ${str(is_handle_to_adapter).lower()}); + } + %endif + %endfor + ${x}_result_t result = ${th.make_pfn_name(n, tags, obj)}( ${", ".join(th.make_param_lines(n, tags, obj, format=["name"]))} ); %for tp in tracked_params: @@ -114,15 +127,10 @@ namespace ur_validation_layer } } %elif func_name in tp_handle_funcs['retain']: - if( getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS ) + if( getContext()->enableLeakChecking ) { getContext()->refCountContext->incrementRefCount(${tp['name']}, ${str(is_handle_to_adapter).lower()}); } - %elif func_name in tp_handle_funcs['release']: - if( getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS ) - { - getContext()->refCountContext->decrementRefCount(${tp['name']}, ${str(is_handle_to_adapter).lower()}); - } %endif %endfor diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b3969de10f..26173a6d14 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -71,12 +71,12 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRelease( } } - ur_result_t result = pfnAdapterRelease(hAdapter); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hAdapter, true); } + ur_result_t result = pfnAdapterRelease(hAdapter); + return result; } @@ -99,7 +99,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRetain( ur_result_t result = pfnAdapterRetain(hAdapter); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hAdapter, true); } @@ -558,7 +558,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRetain( ur_result_t result = pfnRetain(hDevice); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hDevice, false); } @@ -583,12 +583,12 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRelease( } } - ur_result_t result = pfnRelease(hDevice); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hDevice, false); } + ur_result_t result = pfnRelease(hDevice); + return result; } @@ -861,7 +861,7 @@ __urdlllocal ur_result_t UR_APICALL urContextRetain( ur_result_t result = pfnRetain(hContext); - if (getContext()->enableLeakChecking && result == 
UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hContext, false); } @@ -886,12 +886,12 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( } } - ur_result_t result = pfnRelease(hContext); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hContext, false); } + ur_result_t result = pfnRelease(hContext); + return result; } @@ -1248,7 +1248,7 @@ __urdlllocal ur_result_t UR_APICALL urMemRetain( ur_result_t result = pfnRetain(hMem); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hMem, false); } @@ -1273,12 +1273,12 @@ __urdlllocal ur_result_t UR_APICALL urMemRelease( } } - ur_result_t result = pfnRelease(hMem); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hMem, false); } + ur_result_t result = pfnRelease(hMem); + return result; } @@ -1657,7 +1657,7 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRetain( ur_result_t result = pfnRetain(hSampler); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hSampler, false); } @@ -1682,12 +1682,12 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRelease( } } - ur_result_t result = pfnRelease(hSampler); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hSampler, false); } + ur_result_t result = pfnRelease(hSampler); + return result; } @@ -2154,7 +2154,7 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolRetain( ur_result_t result = pfnPoolRetain(pPool); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(pPool, false); } @@ -2178,12 +2178,12 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolRelease( } } - ur_result_t result = pfnPoolRelease(pPool); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(pPool, false); } + ur_result_t result = pfnPoolRelease(pPool); + return result; } @@ -2631,7 +2631,7 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRetain( ur_result_t result = pfnRetain(hPhysicalMem); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hPhysicalMem, false); } @@ -2656,12 +2656,12 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( } } - ur_result_t result = pfnRelease(hPhysicalMem); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hPhysicalMem, false); } + ur_result_t result = pfnRelease(hPhysicalMem); + return result; } @@ -2952,7 +2952,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramRetain( ur_result_t result = pfnRetain(hProgram); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hProgram, false); } @@ -2977,12 +2977,12 @@ __urdlllocal 
ur_result_t UR_APICALL urProgramRelease( } } - ur_result_t result = pfnRelease(hProgram); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hProgram, false); } + ur_result_t result = pfnRelease(hProgram); + return result; } @@ -3618,7 +3618,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( ur_result_t result = pfnRetain(hKernel); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hKernel, false); } @@ -3643,12 +3643,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelRelease( } } - ur_result_t result = pfnRelease(hKernel); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hKernel, false); } + ur_result_t result = pfnRelease(hKernel); + return result; } @@ -4138,7 +4138,7 @@ __urdlllocal ur_result_t UR_APICALL urQueueRetain( ur_result_t result = pfnRetain(hQueue); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hQueue, false); } @@ -4163,12 +4163,12 @@ __urdlllocal ur_result_t UR_APICALL urQueueRelease( } } - ur_result_t result = pfnRelease(hQueue); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hQueue, false); } + ur_result_t result = pfnRelease(hQueue); + return result; } @@ -4454,7 +4454,7 @@ __urdlllocal ur_result_t UR_APICALL urEventRetain( ur_result_t result = pfnRetain(hEvent); - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->incrementRefCount(hEvent, false); } @@ -4478,12 +4478,12 @@ __urdlllocal ur_result_t UR_APICALL urEventRelease( } } - ur_result_t result = pfnRelease(hEvent); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { + if (getContext()->enableLeakChecking) { getContext()->refCountContext->decrementRefCount(hEvent, false); } + ur_result_t result = pfnRelease(hEvent); + return result; } diff --git a/test/conformance/adapter/urAdapterRelease.cpp b/test/conformance/adapter/urAdapterRelease.cpp index 8b29fa0f2c..0b28287aa7 100644 --- a/test/conformance/adapter/urAdapterRelease.cpp +++ b/test/conformance/adapter/urAdapterRelease.cpp @@ -16,6 +16,7 @@ struct urAdapterReleaseTest : uur::runtime::urAdapterTest { TEST_F(urAdapterReleaseTest, Success) { uint32_t referenceCountBefore = 0; + ASSERT_SUCCESS(urAdapterRetain(adapter)); ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, sizeof(referenceCountBefore), diff --git a/test/conformance/device/urDeviceRelease.cpp b/test/conformance/device/urDeviceRelease.cpp index a8f6a3bc9d..dd5510394f 100644 --- a/test/conformance/device/urDeviceRelease.cpp +++ b/test/conformance/device/urDeviceRelease.cpp @@ -8,6 +8,8 @@ struct urDeviceReleaseTest : uur::urAllDevicesTest {}; TEST_F(urDeviceReleaseTest, Success) { for (auto device : devices) { + ASSERT_SUCCESS(urDeviceRetain(device)); + uint32_t prevRefCount = 0; ASSERT_SUCCESS(uur::GetObjectReferenceCount(device, prevRefCount)); diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 1900568292..b1c90883d8 100644 
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -95,6 +95,7 @@ struct urDeviceTest : urPlatformTest,
     void SetUp() override {
         UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp());
         device = GetParam();
+        EXPECT_SUCCESS(urDeviceRetain(device));
     }
 
     void TearDown() override {

From 850e85c7770cdc6fbd603c9d275f47ebbdfa1205 Mon Sep 17 00:00:00 2001
From: "Kenneth Benzie (Benie)"
Date: Mon, 2 Dec 2024 15:25:59 +0000
Subject: [PATCH 037/148] Fix L0 match entry for urEnqueueEventsWaitMultiDeviceMTTest

The match entry for `urEnqueueEventsWaitMultiDeviceMTTest` previously
matched `urEnqueueEventsWaitMultiDeviceMTTest/*` but this doesn't actually
match the test names, which are of the form
`urEnqueueEventsWaitMultiDeviceMTTest./...`. This patch removes the `/` in
order to match all tests using this fixture name.

Failures like the following have been occurring in unrelated PRs today:

```
[ RUN      ] urEnqueueEventsWaitMultiDeviceMTTest.EnqueueWaitSingleQueueMultiOps/MultiThread
/home/test-user/actions-runner/_work/unified-runtime/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp:54: Failure
Expected equality of these values:
  reinterpret_cast(ptr)[i]
    Which is: 0
  pattern
    Which is: 42
[  FAILED  ] urEnqueueEventsWaitMultiDeviceMTTest.EnqueueWaitSingleQueueMultiOps/MultiThread, where GetParam() = 40-byte object
```
---
 test/conformance/enqueue/enqueue_adapter_level_zero.match | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match
index 7e3405a963..4155859eaf 100644
--- a/test/conformance/enqueue/enqueue_adapter_level_zero.match
+++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match
@@ -20,7 +20,7 @@
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin1D/*
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin2D/*
 {{OPT}}urEnqueueMemImageReadTest.InvalidOrigin3D/*
-{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest/*
+{{OPT}}urEnqueueEventsWaitMultiDeviceMTTest*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependencies/*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesBarrierOnly/*
 {{OPT}}urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesLaunchOnly/*

From 974d09975994c83522bb6d2bf5eacdd5f1f51cb9 Mon Sep 17 00:00:00 2001
From: Aaron Greig
Date: Tue, 29 Oct 2024 16:33:30 +0000
Subject: [PATCH 038/148] Explicitly specify USMFree as a blocking operation.

Also add test to enforce this.
---
 include/ur_api.h                    |  5 ++
 scripts/core/usm.yml                |  2 +
 source/loader/ur_libapi.cpp         |  5 ++
 source/ur_api.cpp                   |  5 ++
 test/conformance/CMakeLists.txt     |  2 +-
 test/conformance/usm/CMakeLists.txt |  2 +-
 test/conformance/usm/urUSMFree.cpp  | 81 +++++++++++++++++++
 .../usm/usm_adapter_native_cpu.match |  1 +
 8 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index eb8b07221c..08ca0de32f 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -3661,6 +3661,11 @@ urUSMSharedAlloc(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Free the USM memory object
 ///
+/// @details
+///     - Note that implementations are required to wait for previously enqueued
+///       commands that may be accessing `pMem` to finish before freeing the
+///       memory.
+///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
diff --git a/scripts/core/usm.yml b/scripts/core/usm.yml
index 77d5b7260a..db112ed8eb 100644
--- a/scripts/core/usm.yml
+++ b/scripts/core/usm.yml
@@ -364,6 +364,8 @@ desc: "Free the USM memory object"
 class: $xUSM
 name: Free
 ordinal: "0"
+details:
+    - "Note that implementations are required to wait for previously enqueued commands that may be accessing `pMem` to finish before freeing the memory."
 params:
     - type: $x_context_handle_t
       name: hContext
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..7ec4148736 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -2404,6 +2404,11 @@ ur_result_t UR_APICALL urUSMSharedAlloc(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Free the USM memory object
 ///
+/// @details
+///     - Note that implementations are required to wait for previously enqueued
+///       commands that may be accessing `pMem` to finish before freeing the
+///       memory.
+///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 853d61472e..a33a4caddc 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -2080,6 +2080,11 @@ ur_result_t UR_APICALL urUSMSharedAlloc(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Free the USM memory object
 ///
+/// @details
+///     - Note that implementations are required to wait for previously enqueued
+///       commands that may be accessing `pMem` to finish before freeing the
+///       memory.
+///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt
index cfc4725837..e71a829964 100644
--- a/test/conformance/CMakeLists.txt
+++ b/test/conformance/CMakeLists.txt
@@ -112,7 +112,6 @@ add_subdirectory(platform)
 add_subdirectory(device)
 add_subdirectory(context)
 add_subdirectory(memory)
-add_subdirectory(usm)
 add_subdirectory(event)
 add_subdirectory(queue)
 add_subdirectory(sampler)
@@ -129,6 +128,7 @@ set(TEST_SUBDIRECTORIES_DPCXX
     "exp_usm_p2p"
     "exp_launch_properties"
     "memory-migrate"
+    "usm"
 )
 
 if(UR_DPCXX)
diff --git a/test/conformance/usm/CMakeLists.txt b/test/conformance/usm/CMakeLists.txt
index 8a7ae23fed..f197447cbf 100644
--- a/test/conformance/usm/CMakeLists.txt
+++ b/test/conformance/usm/CMakeLists.txt
@@ -3,7 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-add_conformance_test_with_devices_environment(usm
+add_conformance_test_with_kernels_environment(usm
     urUSMDeviceAlloc.cpp
     urUSMFree.cpp
     urUSMGetMemAllocInfo.cpp
diff --git a/test/conformance/usm/urUSMFree.cpp b/test/conformance/usm/urUSMFree.cpp
index f5502c89a6..ca5f0c225c 100644
--- a/test/conformance/usm/urUSMFree.cpp
+++ b/test/conformance/usm/urUSMFree.cpp
@@ -95,3 +95,84 @@ TEST_P(urUSMFreeTest, InvalidNullPtrMem) {
     ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
                      urUSMFree(context, nullptr));
 }
+
+// The goal of this test is to ensure urUSMFree blocks and waits for operations
+// accessing the given allocation to finish before actually freeing the memory.
+struct urUSMFreeDuringExecutionTest : uur::urKernelExecutionTest { + void SetUp() { + program_name = "fill_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + void *allocation = nullptr; + size_t array_size = 256; + size_t allocation_size = array_size * sizeof(uint32_t); + uint32_t data = 42; + size_t wg_offset = 0; +}; +UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urUSMFreeDuringExecutionTest); + +TEST_P(urUSMFreeDuringExecutionTest, SuccessHost) { + ur_device_usm_access_capability_flags_t host_usm_flags = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, host_usm_flags)); + if (!(host_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Host USM is not supported."; + } + + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, + &allocation)); + ASSERT_NE(allocation, nullptr); + + EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); + EXPECT_SUCCESS( + urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urUSMFree(context, allocation)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + +TEST_P(urUSMFreeDuringExecutionTest, SuccessDevice) { + ur_device_usm_access_capability_flags_t device_usm_flags = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm_flags)); + if (!(device_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Device USM is not supported."; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &allocation)); + ASSERT_NE(allocation, nullptr); + + EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); + EXPECT_SUCCESS( + urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); + + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urUSMFree(context, allocation)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + +TEST_P(urUSMFreeDuringExecutionTest, SuccessShared) { + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &allocation)); + ASSERT_NE(allocation, nullptr); + + EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); + EXPECT_SUCCESS( + urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urUSMFree(context, allocation)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} diff --git a/test/conformance/usm/usm_adapter_native_cpu.match b/test/conformance/usm/usm_adapter_native_cpu.match index 6ef26e2bdf..603a25b3e2 100644 --- a/test/conformance/usm/usm_adapter_native_cpu.match +++ b/test/conformance/usm/usm_adapter_native_cpu.match @@ -3,6 +3,7 @@ urUSMDeviceAllocTest.InvalidUSMSize/*__UsePoolDisabled urUSMFreeTest.SuccessDeviceAlloc/* urUSMFreeTest.SuccessHostAlloc/* urUSMFreeTest.SuccessSharedAlloc/* +urUSMFreeDuringExecutionTest.* urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_TYPE urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_BASE_PTR 
urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_SIZE

From b7b6b55769c2f4e8ffb077a9f945e371232148f2 Mon Sep 17 00:00:00 2001
From: "Larsen, Steffen"
Date: Mon, 2 Dec 2024 08:34:30 -0800
Subject: [PATCH 039/148] [L0] Fix cached and evicted timestamp recordings

This commit fixes two issues with the Level Zero implementation of timestamp
recording events:

 * Events allocated for timestamp recordings may have been previously used,
   which may lead the implementation to think that the recordings of the old
   timestamp are valid. The implementation will now reset the value.
 * To avoid cases where timestamp recordings could conflict in the recordings
   buffer, unfinished recordings of dead events are now moved to another map,
   to be evicted fully on queue synchronization or destruction.

Signed-off-by: Larsen, Steffen
---
 source/adapters/level_zero/event.cpp | 20 +++++++++---------
 source/adapters/level_zero/queue.cpp | 31 ++++++++++++++--------------
 source/adapters/level_zero/queue.hpp | 15 ++++++--------
 3 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 96da4be0fd..f7472ac325 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -574,8 +574,7 @@ ur_result_t urEventGetProfilingInfo(
 
     // End time needs to be adjusted for resolution and valid bits.
     uint64_t ContextEndTime =
-        (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) *
-        ZeTimerResolution;
+        (EndTimeRecording & TimestampMaxValue) * ZeTimerResolution;
 
     // If the result is 0, we have not yet gotten results back and so we just
     // return it.
@@ -748,20 +747,20 @@ ur_result_t urEnqueueTimestampRecordingExp(
   ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent;
   (*OutEvent)->WaitList = TmpWaitList;
 
+  // Reset the end timestamp, in case it has been previously used.
+  (*OutEvent)->RecordEventEndTimestamp = 0;
+
   uint64_t DeviceStartTimestamp = 0;
   UR_CALL(ur::level_zero::urDeviceGetGlobalTimestamps(
       Device, &DeviceStartTimestamp, nullptr));
   (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp;
 
   // Create a new entry in the queue's recordings.
-  Queue->EndTimeRecordings[*OutEvent] =
-      ur_queue_handle_t_::end_time_recording{};
+  Queue->EndTimeRecordings[*OutEvent] = 0;
 
   ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp,
-             (CommandList->first,
-              &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp,
-              ZeEvent, (*OutEvent)->WaitList.Length,
-              (*OutEvent)->WaitList.ZeEventList));
+             (CommandList->first, &Queue->EndTimeRecordings[*OutEvent], ZeEvent,
+              (*OutEvent)->WaitList.Length, (*OutEvent)->WaitList.ZeEventList));
 
   UR_CALL(
       Queue->executeCommandList(CommandList, Blocking, false /* OkToBatch */));
@@ -1089,10 +1088,11 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
     auto Entry = Queue->EndTimeRecordings.find(Event);
     if (Entry != Queue->EndTimeRecordings.end()) {
       auto &EndTimeRecording = Entry->second;
-      if (EndTimeRecording.RecordEventEndTimestamp == 0) {
+      if (EndTimeRecording == 0) {
        // If the end time recording has not finished, we tell the queue that
        // the event is no longer alive to avoid invalid write-backs.
+        Queue->EvictedEndTimeRecordings.insert(
+            Queue->EndTimeRecordings.extract(Entry));
      } else {
        // Otherwise we evict the entry.
Queue->EndTimeRecordings.erase(Entry); diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index c4598f3472..e39786a51a 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1563,24 +1563,23 @@ void ur_queue_handle_t_::clearEndTimeRecordings() { for (auto Entry : EndTimeRecordings) { auto &Event = Entry.first; auto &EndTimeRecording = Entry.second; - if (!Entry.second.EventHasDied) { - // Write the result back to the event if it is not dead. - uint64_t ContextEndTime = - (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * - ZeTimerResolution; - - // Handle a possible wrap-around (the underlying HW counter is < 64-bit). - // Note, it will not report correct time if there were multiple wrap - // arounds, and the longer term plan is to enlarge the capacity of the - // HW timestamps. - if (ContextEndTime < Event->RecordEventStartTimestamp) - ContextEndTime += TimestampMaxValue * ZeTimerResolution; - - // Store it in the event. - Event->RecordEventEndTimestamp = ContextEndTime; - } + + // Write the result back to the event if it is not dead. + uint64_t ContextEndTime = + (EndTimeRecording & TimestampMaxValue) * ZeTimerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < Event->RecordEventStartTimestamp) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Store it in the event. + Event->RecordEventEndTimestamp = ContextEndTime; } EndTimeRecordings.clear(); + EvictedEndTimeRecordings.clear(); } ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 1108e4c268..bd9a07dd5a 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -490,15 +490,12 @@ struct ur_queue_handle_t_ : _ur_object { // End-times enqueued are stored on the queue rather than on the event to // avoid the event objects having been destroyed prior to the write to the // end-time member. - struct end_time_recording { - // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as - // it is written asynchronously. - uint64_t RecordEventEndTimestamp = 0; - // The event may die before the recording has been written back. In this - // case the event will mark this for deletion when the queue sees fit. - bool EventHasDied = false; - }; - std::map EndTimeRecordings; + // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as + // it is written asynchronously. + std::map EndTimeRecordings; + // The event may die before the recording has been written back. In this case + // we move it to a separate map to avoid conflicts. + std::multimap EvictedEndTimeRecordings; // Clear the end time recording timestamps entries. 
void clearEndTimeRecordings();

From acca3a2d9343edd8f986df08821cfb51f9a8733b Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz
Date: Mon, 2 Dec 2024 17:56:21 +0000
Subject: [PATCH 040/148] [Benchmarks] fix label for compute benchmark

---
 scripts/benchmarks/benches/compute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py
index f872399e9e..c00ec2ec52 100644
--- a/scripts/benchmarks/benches/compute.py
+++ b/scripts/benchmarks/benches/compute.py
@@ -115,7 +115,7 @@ def run(self, env_vars) -> list[Result]:
         parsed_results = self.parse_output(result)
         ret = []
         for label, mean, unit in parsed_results:
-            extra_label = " CPU count" if parse_unit_type(unit) == "CPU count" else ""
+            extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
             ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
         return ret

From f59a842cb8d0fcc4c8bb7e3570a42181da1e5ec4 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Tue, 19 Nov 2024 09:53:06 -0800
Subject: [PATCH 041/148] [L0] Enable zesInit by default given newer L0 IP version

- If the user has not overridden the sysman init type, then the adapter
  will check the device IP version to determine if zesInit should be used
  and the env disabled.

Signed-off-by: Neil R. Spruit
---
 source/adapters/level_zero/adapter.cpp | 60 +++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp
index f2ac67ddff..5dff84503c 100644
--- a/source/adapters/level_zero/adapter.cpp
+++ b/source/adapters/level_zero/adapter.cpp
@@ -76,6 +76,45 @@ ur_result_t getZesDeviceHandle(zes_uuid_t coreDeviceUuid,
   return UR_RESULT_ERROR_INVALID_ARGUMENT;
 }
 
+ur_result_t checkDeviceIntelGPUIpVersionOrNewer(uint32_t ipVersion) {
+  uint32_t ZeDriverCount = 0;
+  ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
+  if (ZeDriverCount == 0) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  std::vector ZeDrivers;
+  std::vector ZeDevices;
+  ZeDrivers.resize(ZeDriverCount);
+
+  ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
+  for (uint32_t I = 0; I < ZeDriverCount; ++I) {
+    ze_device_properties_t device_properties{};
+    ze_device_ip_version_ext_t ipVersionExt{};
+    ipVersionExt.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
+    ipVersionExt.pNext = nullptr;
+    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    device_properties.pNext = &ipVersionExt;
+    uint32_t ZeDeviceCount = 0;
+    ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr));
+    ZeDevices.resize(ZeDeviceCount);
+    ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data()));
+    // Check if this driver has GPU Devices that have this IP Version or newer.
+    for (uint32_t D = 0; D < ZeDeviceCount; ++D) {
+      ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties));
+      if (device_properties.type == ZE_DEVICE_TYPE_GPU &&
+          device_properties.vendorId == 0x8086) {
+        ze_device_ip_version_ext_t *ipVersionExt =
+            (ze_device_ip_version_ext_t *)device_properties.pNext;
+        if (ipVersionExt->ipVersion >= ipVersion) {
+          return UR_RESULT_SUCCESS;
+        }
+      }
+    }
+  }
+  return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+}
+
+/**
+ * @brief Initializes the platforms by querying Level Zero drivers and devices.
* @@ -282,11 +321,13 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() return; } + uint32_t UserForcedSysManInit = 0; // Check if the user has disabled the default L0 Env initialization. - const int UrSysManEnvInitEnabled = [] { + const int UrSysManEnvInitEnabled = [&UserForcedSysManInit] { const char *UrRet = std::getenv("UR_L0_ENABLE_SYSMAN_ENV_DEFAULT"); if (!UrRet) return 1; + UserForcedSysManInit &= 1; return std::atoi(UrRet); }(); @@ -419,16 +460,25 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() #endif // Check if the user has enabled the default L0 SysMan initialization. - const int UrSysmanZesinitEnable = [] { + const int UrSysmanZesinitEnable = [&UserForcedSysManInit] { const char *UrRet = std::getenv("UR_L0_ENABLE_ZESINIT_DEFAULT"); if (!UrRet) return 0; + UserForcedSysManInit &= 2; return std::atoi(UrRet); }(); - // Enable zesInit by default only if ZES_ENABLE_SYSMAN has not been set by - // default and UrSysmanZesinitEnable is true. - if (UrSysmanZesinitEnable && !UrSysManEnvInitEnabled) { + bool ZesInitNeeded = UrSysmanZesinitEnable && !UrSysManEnvInitEnabled; + // Unless the user has forced the SysMan init, we will check the device + // version to see if the zesInit is needed. + if (UserForcedSysManInit == 0 && + checkDeviceIntelGPUIpVersionOrNewer(0x05004000) == UR_RESULT_SUCCESS) { + if (UrSysManEnvInitEnabled) { + setEnvVar("ZES_ENABLE_SYSMAN", "0"); + } + ZesInitNeeded = true; + } + if (ZesInitNeeded) { GlobalAdapter->getDeviceByUUIdFunctionPtr = (zes_pfnDriverGetDeviceByUuidExp_t) ur_loader::LibLoader::getFunctionPtr( From a4bc0e63a18ccd240359b36e0585e8badaf4b2a3 Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Tue, 3 Dec 2024 07:04:05 +0100 Subject: [PATCH 042/148] fix 2 pvc machine --- source/loader/layers/sanitizer/asan/asan_interceptor.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 589e449869..569010ca35 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -457,10 +457,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { Device, Program, kSPIR_AsanSpirKernelMetadata, &MetadataSize, &MetadataPtr); if (Result != UR_RESULT_SUCCESS) { - getContext()->logger.error( - "Can't get the pointer of <{}> under device {}: {}", - kSPIR_AsanSpirKernelMetadata, (void *)Device, Result); - return Result; + continue; } const uint64_t NumOfSpirKernel = MetadataSize / sizeof(SpirKernelInfo); From 3ff07309e1ce2055b8a2a2133ea55594e55ea6ef Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 2 Dec 2024 11:38:04 +0000 Subject: [PATCH 043/148] Add UR_ADAPTER_INFO_VERSION query In order to support in-code management of known failures we need a query to differentiate between the two versions of the Level Zero adapter. This patch adds the `UR_ADAPTER_INFO_VERSION` query which returns a `uint32_t` which is set to return `1` for all adapters except the Level Zero adapter when being compiled in V2 mode. 
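Once the query lands, consumers can branch on the adapter version with an
ordinary info query; a minimal usage sketch (error handling elided):

```cpp
#include <ur_api.h>

// Query the new UR_ADAPTER_INFO_VERSION property. Per this patch it returns
// 1 for every adapter except Level Zero built in V2 mode, which returns 2.
uint32_t getAdapterVersion(ur_adapter_handle_t hAdapter) {
  uint32_t version = 0;
  urAdapterGetInfo(hAdapter, UR_ADAPTER_INFO_VERSION, sizeof(version),
                   &version, nullptr);
  return version;
}
```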
---
 include/ur_api.h                              |  5 ++++-
 include/ur_print.hpp                          | 15 +++++++++++++++
 scripts/core/adapter.yml                      |  5 +++++
 source/adapters/cuda/adapter.cpp              |  2 ++
 source/adapters/hip/adapter.cpp               |  2 ++
 source/adapters/level_zero/adapter.cpp        |  8 ++++++++
 source/adapters/native_cpu/adapter.cpp        |  2 ++
 source/adapters/opencl/adapter.cpp            |  2 ++
 source/loader/layers/validation/ur_valddi.cpp |  2 +-
 source/loader/ur_libapi.cpp                   |  2 +-
 source/ur_api.cpp                             |  2 +-
 test/conformance/adapter/urAdapterGetInfo.cpp |  4 +++-
 tools/urinfo/urinfo.hpp                       |  2 ++
 13 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index eb8b07221c..9890291e36 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -967,6 +967,9 @@ typedef enum ur_adapter_info_t {
                                           ///< The reference count returned should be considered immediately stale.
                                           ///< It is unsuitable for general use in applications. This feature is
                                           ///< provided for identifying memory leaks.
+    UR_ADAPTER_INFO_VERSION = 2,          ///< [uint32_t] Specifies the adapter version, initial value of 1 and
+                                          ///< incremented upon major changes, e.g. when multiple versions of an
+                                          ///< adapter may exist in parallel.
     /// @cond
     UR_ADAPTER_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -988,7 +991,7 @@ typedef enum ur_adapter_info_t {
///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
///         + `NULL == hAdapter`
///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
///         + If `propName` is not supported by the adapter.
///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 8888a74f91..cdc8a36243 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -1922,6 +1922,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_adapter_info_t value)
     case UR_ADAPTER_INFO_REFERENCE_COUNT:
         os << "UR_ADAPTER_INFO_REFERENCE_COUNT";
         break;
+    case UR_ADAPTER_INFO_VERSION:
+        os << "UR_ADAPTER_INFO_VERSION";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -1962,6 +1965,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_adapter_inf
         os << ")";
     } break;
+    case UR_ADAPTER_INFO_VERSION: {
+        const uint32_t *tptr = (const uint32_t *)ptr;
+        if (sizeof(uint32_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;
diff --git a/scripts/core/adapter.yml b/scripts/core/adapter.yml
index a4eddd823c..4fc9a104ed 100644
--- a/scripts/core/adapter.yml
+++ b/scripts/core/adapter.yml
@@ -136,6 +136,11 @@ etors:
         [uint32_t] Reference count of the adapter.
         The reference count returned should be considered immediately stale.
        It is unsuitable for general use in applications. This feature is provided for identifying memory leaks.
+    - name: VERSION
+      desc: >
+          [uint32_t] Specifies the adapter version, initial value of 1 and
+          incremented upon major changes, e.g. when multiple versions of an
+          adapter may exist in parallel.
--- #-------------------------------------------------------------------------- type: function desc: "Retrieves information about the adapter" diff --git a/source/adapters/cuda/adapter.cpp b/source/adapters/cuda/adapter.cpp index c8949cd9a8..49bb964f8e 100644 --- a/source/adapters/cuda/adapter.cpp +++ b/source/adapters/cuda/adapter.cpp @@ -108,6 +108,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_CUDA); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/hip/adapter.cpp b/source/adapters/hip/adapter.cpp index 99db21695f..1bfe498bf6 100644 --- a/source/adapters/hip/adapter.cpp +++ b/source/adapters/hip/adapter.cpp @@ -96,6 +96,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_HIP); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index f2ac67ddff..5d6583f3d4 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -655,6 +655,14 @@ ur_result_t urAdapterGetInfo(ur_adapter_handle_t, ur_adapter_info_t PropName, return ReturnValue(UR_ADAPTER_BACKEND_LEVEL_ZERO); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(GlobalAdapter->RefCount.load()); + case UR_ADAPTER_INFO_VERSION: { +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 + uint32_t adapterVersion = 2; +#else + uint32_t adapterVersion = 1; +#endif + return ReturnValue(adapterVersion); + } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/native_cpu/adapter.cpp b/source/adapters/native_cpu/adapter.cpp index 2b5b95ccd0..727c3e3dba 100644 --- a/source/adapters/native_cpu/adapter.cpp +++ b/source/adapters/native_cpu/adapter.cpp @@ -57,6 +57,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_NATIVE_CPU); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(Adapter.RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp index bf81f6bdaf..162bc59b6a 100644 --- a/source/adapters/opencl/adapter.cpp +++ b/source/adapters/opencl/adapter.cpp @@ -128,6 +128,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return ReturnValue(UR_ADAPTER_BACKEND_OPENCL); case UR_ADAPTER_INFO_REFERENCE_COUNT: return ReturnValue(adapter->RefCount.load()); + case UR_ADAPTER_INFO_VERSION: + return ReturnValue(uint32_t{1}); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b3969de10f..ebdb4b5384 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -182,7 +182,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_ADAPTER_INFO_REFERENCE_COUNT < propName) { + if (UR_ADAPTER_INFO_VERSION < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git 
a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..898f259e0b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -451,7 +451,7 @@ ur_result_t UR_APICALL urAdapterGetLastError(
///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
///         + `NULL == hAdapter`
///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
///         + If `propName` is not supported by the adapter.
///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 853d61472e..4a57f5467d 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -410,7 +410,7 @@ ur_result_t UR_APICALL urAdapterGetLastError(
///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
///         + `NULL == hAdapter`
///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_ADAPTER_INFO_REFERENCE_COUNT < propName`
+///         + `::UR_ADAPTER_INFO_VERSION < propName`
///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
///         + If `propName` is not supported by the adapter.
///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp
index 4dff3ce4dc..63c3cbfca1 100644
--- a/test/conformance/adapter/urAdapterGetInfo.cpp
+++ b/test/conformance/adapter/urAdapterGetInfo.cpp
@@ -20,12 +20,14 @@ struct urAdapterGetInfoTest : uur::runtime::urAdapterTest,
 
 std::unordered_map adapter_info_size_map = {
     {UR_ADAPTER_INFO_BACKEND, sizeof(ur_adapter_backend_t)},
+    {UR_ADAPTER_INFO_VERSION, sizeof(uint32_t)},
     {UR_ADAPTER_INFO_REFERENCE_COUNT, sizeof(uint32_t)},
 };
 
 INSTANTIATE_TEST_SUITE_P(
     urAdapterGetInfo, urAdapterGetInfoTest,
-    ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_REFERENCE_COUNT),
+    ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_VERSION,
+                      UR_ADAPTER_INFO_REFERENCE_COUNT),
     [](const ::testing::TestParamInfo &info) {
         std::stringstream ss;
         ss << info.param;
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp
index 37c7a80328..813ca34da1 100644
--- a/tools/urinfo/urinfo.hpp
+++ b/tools/urinfo/urinfo.hpp
@@ -28,6 +28,8 @@ inline void printAdapterInfos(ur_adapter_handle_t hAdapter,
                               std::string_view prefix = "  ") {
     std::cout << prefix;
     printAdapterInfo(hAdapter, UR_ADAPTER_INFO_BACKEND);
+    std::cout << prefix;
+    printAdapterInfo(hAdapter, UR_ADAPTER_INFO_VERSION);
 }
 
 inline void printPlatformInfos(ur_platform_handle_t hPlatform,

From 6390f6370a577f036b430a7ce447c554679354e1 Mon Sep 17 00:00:00 2001
From: "Gainullin, Artur"
Date: Tue, 26 Nov 2024 16:55:27 -0800
Subject: [PATCH 044/148] [L0] Set exec info for all L0 kernels in UR kernel

Currently, at a urKernelSetExecInfo call, a flag is set only for a single L0
kernel handle associated with one of the devices. In the multi-device case
there might be multiple handles and we have to set the flag for all of them.
Currently this issue causes SYCL "free function kernel" tests to fail in
multi-device environments because the indirect access flag is not set for
some of the modules.
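For reference, the affected call looks like this from the user's side: the
flag is set once on the UR kernel handle, and with this patch the Level Zero
adapter fans it out to every device-specific ZeKernel rather than just the
first one. A minimal sketch (error handling elided):

```cpp
#include <ur_api.h>

// Request indirect USM access for a kernel. After this patch the Level Zero
// adapter applies the resulting ZE_KERNEL_INDIRECT_ACCESS_* flags to all
// ZeKernels in Kernel->ZeKernels, covering every device in the context.
ur_result_t enableIndirectAccess(ur_kernel_handle_t hKernel) {
  ur_bool_t enable = true;
  return urKernelSetExecInfo(hKernel, UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS,
                             sizeof(enable), /*pProperties=*/nullptr, &enable);
}
```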
--- source/adapters/level_zero/kernel.cpp | 55 ++++++++++++++------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index b98b7e701b..153d3861b1 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -947,35 +947,36 @@ ur_result_t urKernelSetExecInfo( std::ignore = PropSize; std::ignore = Properties; - auto ZeKernel = Kernel->ZeKernel; std::scoped_lock Guard(Kernel->Mutex); - if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && - *(static_cast(PropValue)) == true) { - // The whole point for users really was to not need to know anything - // about the types of allocations kernel uses. So in DPC++ we always - // just set all 3 modes for each kernel. - ze_kernel_indirect_access_flags_t IndirectFlags = - ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | - ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | - ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; - ZE2UR_CALL(zeKernelSetIndirectAccess, (ZeKernel, IndirectFlags)); - } else if (PropName == UR_KERNEL_EXEC_INFO_CACHE_CONFIG) { - ze_cache_config_flag_t ZeCacheConfig{}; - auto CacheConfig = - *(static_cast(PropValue)); - if (CacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM) - ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; - else if (CacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) - ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA; - else if (CacheConfig == UR_KERNEL_CACHE_CONFIG_DEFAULT) - ZeCacheConfig = static_cast(0); - else - // Unexpected cache configuration value. + for (auto &ZeKernel : Kernel->ZeKernels) { + if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && + *(static_cast(PropValue)) == true) { + // The whole point for users really was to not need to know anything + // about the types of allocations kernel uses. So in DPC++ we always + // just set all 3 modes for each kernel. + ze_kernel_indirect_access_flags_t IndirectFlags = + ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + ZE2UR_CALL(zeKernelSetIndirectAccess, (ZeKernel, IndirectFlags)); + } else if (PropName == UR_KERNEL_EXEC_INFO_CACHE_CONFIG) { + ze_cache_config_flag_t ZeCacheConfig{}; + auto CacheConfig = + *(static_cast(PropValue)); + if (CacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; + else if (CacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA; + else if (CacheConfig == UR_KERNEL_CACHE_CONFIG_DEFAULT) + ZeCacheConfig = static_cast(0); + else + // Unexpected cache configuration value. 
+ return UR_RESULT_ERROR_INVALID_VALUE; + ZE2UR_CALL(zeKernelSetCacheConfig, (ZeKernel, ZeCacheConfig);); + } else { + logger::error("urKernelSetExecInfo: unsupported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; - ZE2UR_CALL(zeKernelSetCacheConfig, (ZeKernel, ZeCacheConfig);); - } else { - logger::error("urKernelSetExecInfo: unsupported ParamName"); - return UR_RESULT_ERROR_INVALID_VALUE; + } } return UR_RESULT_SUCCESS; From 1433f042f17d2a3f1986bf7b677cae3e0bfba5a0 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 2 Jul 2024 18:02:46 +0100 Subject: [PATCH 045/148] Add initial spec for tensor map APIs --- include/ur_api.h | 242 ++++++ include/ur_ddi.h | 66 ++ include/ur_print.h | 56 ++ include/ur_print.hpp | 717 ++++++++++++++++++ scripts/core/EXP-TENSOR-MAP.rst | 69 ++ scripts/core/exp-tensor-map.yml | 207 +++++ scripts/core/registry.yml | 6 + source/adapters/adapter.def.in | 1 + source/adapters/adapter.map.in | 1 + source/adapters/mock/ur_mockddi.cpp | 198 +++++ source/loader/layers/tracing/ur_trcddi.cpp | 194 +++++ source/loader/layers/validation/ur_valddi.cpp | 259 +++++++ source/loader/loader.def.in | 10 + source/loader/loader.map.in | 10 + source/loader/ur_ldrddi.cpp | 200 +++++ source/loader/ur_ldrddi.hpp | 5 + source/loader/ur_libapi.cpp | 148 ++++ source/loader/ur_libddi.cpp | 5 + source/loader/ur_print.cpp | 59 ++ source/ur_api.cpp | 127 ++++ 20 files changed, 2580 insertions(+) create mode 100644 scripts/core/EXP-TENSOR-MAP.rst create mode 100644 scripts/core/exp-tensor-map.yml diff --git a/include/ur_api.h b/include/ur_api.h index eb8b07221c..13334a9c8e 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -231,6 +231,8 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP = 244, ///< Enumerator for ::urCommandBufferUpdateWaitEventsExp UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 245, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT = 246, ///< Enumerator for ::urEnqueueEventsWaitWithBarrierExt + UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 230, ///< Enumerator for ::urTensorMapEncodeIm2ColExp + UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 231, ///< Enumerator for ::urTensorMapEncodeTiledExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -10161,6 +10163,203 @@ urEnqueueNativeCommandExp( ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. 
); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs +#if !defined(__GNUC__) +#pragma region tensor map(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Handle of tensor map object +typedef struct ur_exp_tensor_map_handle_t_ *ur_exp_tensor_map_handle_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Tensor map data type +typedef uint32_t ur_exp_tensor_map_data_type_flags_t; +typedef enum ur_exp_tensor_map_data_type_flag_t { + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8 = UR_BIT(0), ///< 1 byte + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16 = UR_BIT(1), ///< 2 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32 = UR_BIT(2), ///< 4 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32 = UR_BIT(3), ///< 4 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64 = UR_BIT(4), ///< 8 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64 = UR_BIT(5), ///< 8 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16 = UR_BIT(6), ///< 2 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32 = UR_BIT(7), ///< 4 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64 = UR_BIT(8), ///< 8 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16 = UR_BIT(9), ///< 2 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ = UR_BIT(10), ///< 4 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32 = UR_BIT(11), ///< 4 bytes + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ = UR_BIT(12), ///< 4 bytes + /// @cond + UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_tensor_map_data_type_flag_t; +/// @brief Bit Mask for validating ur_exp_tensor_map_data_type_flags_t +#define UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK 0xffffe000 + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Tensor map interleave +typedef uint32_t ur_exp_tensor_map_interleave_flags_t; +typedef enum ur_exp_tensor_map_interleave_flag_t { + UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE = UR_BIT(0), ///< No interleave + UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B = UR_BIT(1), ///< 16B interleave + UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B = UR_BIT(2), ///< 32B interleave + /// @cond + UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_tensor_map_interleave_flag_t; +/// @brief Bit Mask for validating ur_exp_tensor_map_interleave_flags_t +#define UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK 0xfffffff8 + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Tensor map l2 promotion +typedef uint32_t ur_exp_tensor_map_l2_promotion_flags_t; +typedef enum ur_exp_tensor_map_l2_promotion_flag_t { + UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE = UR_BIT(0), ///< No promotion type + UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B = UR_BIT(1), ///< 64B promotion type + UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B = UR_BIT(2), ///< 128B promotion type + UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B = UR_BIT(3), ///< 256B promotion type + /// @cond + UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_tensor_map_l2_promotion_flag_t; +/// @brief Bit Mask for validating ur_exp_tensor_map_l2_promotion_flags_t +#define UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK 0xfffffff0 + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Tensor map swizzle +typedef uint32_t ur_exp_tensor_map_swizzle_flags_t; +typedef enum ur_exp_tensor_map_swizzle_flag_t { + 
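+    // NOTE: As with the other tensor-map flag types in this region, each
+    // enumerator occupies a single bit, and the companion *_FLAGS_MASK macro
+    // has a bit set for every position that is *not* a valid flag. A sketch
+    // of the validation this enables (mirroring the error conditions
+    // documented on the encode entry points below):
+    //
+    //     if (Swizzle & UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK)
+    //         return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    //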
UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE = UR_BIT(0), ///< No swizzle + UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B = UR_BIT(1), ///< 32B swizzle + UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B = UR_BIT(2), ///< 64B swizzle + UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B = UR_BIT(3), ///< 128B swizzle + /// @cond + UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_tensor_map_swizzle_flag_t; +/// @brief Bit Mask for validating ur_exp_tensor_map_swizzle_flags_t +#define UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK 0xfffffff0 + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Tensor map OOB fill +typedef uint32_t ur_exp_tensor_map_oob_fill_flags_t; +typedef enum ur_exp_tensor_map_oob_fill_flag_t { + UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE = UR_BIT(0), ///< No OOB fill + UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA = UR_BIT(1), ///< Refer to NVIDIA docs + /// @cond + UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_tensor_map_oob_fill_flag_t; +/// @brief Bit Mask for validating ur_exp_tensor_map_oob_fill_flags_t +#define UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK 0xfffffffc + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Encode tensor map with image data +/// +/// @details +/// - Map encode using im2col. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType` +/// + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave` +/// + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle` +/// + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion` +/// + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == GlobalAddress` +/// + `NULL == GlobalDim` +/// + `NULL == GlobalStrides` +/// + `NULL == PixelBoxLowerCorner` +/// + `NULL == PixelBoxUpperCorner` +/// + `NULL == ElementStrides` +/// + `NULL == hTensorMap` +UR_APIEXPORT ur_result_t UR_APICALL +urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void *GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t *GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t *GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const int *PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. + const int *PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. + uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. 
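+    // Note on the im2col parameters above: they mirror the parameter list of
+    // CUDA's cuTensorMapEncodeIm2col, which this entry point is intended to
+    // target. The box corners are signed (const int) because they are
+    // offsets, relative to each output pixel, bounding the DHW region the
+    // convolution gathers; ChannelsPerPixel and PixelsPerColumn then bound
+    // the channel dimension and the number of pixels flattened into one
+    // column of the im2col matrix.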
+ ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Encode tensor map with tiled data +/// +/// @details +/// - Tiled map encode. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType` +/// + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave` +/// + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle` +/// + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion` +/// + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == GlobalAddress` +/// + `NULL == GlobalDim` +/// + `NULL == GlobalStrides` +/// + `NULL == BoxDim` +/// + `NULL == ElementStrides` +/// + `NULL == hTensorMap` +UR_APIEXPORT ur_result_t UR_APICALL +urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void *GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t *GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t *GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t *BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. + const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. 
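+    // A minimal usage sketch for this entry point (illustrative only:
+    // hDevice is assumed to be a valid CUDA device handle and devPtr a
+    // device USM allocation). It encodes a dense rank-3 FLOAT32 tensor of
+    // 256x128x64 elements, traversed in 32x8x1-element boxes:
+    //
+    //     uint64_t dim[3] = {256, 128, 64};
+    //     uint64_t strides[2] = {256 * sizeof(float),          // rank - 1
+    //                            256 * 128 * sizeof(float)};   // entries
+    //     uint32_t box[3] = {32, 8, 1};
+    //     uint32_t elemStrides[3] = {1, 1, 1};
+    //     ur_exp_tensor_map_handle_t hMap;
+    //     ur_result_t res = urTensorMapEncodeTiledExp(
+    //         hDevice, UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32, 3, devPtr,
+    //         dim, strides, box, elemStrides,
+    //         UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE,
+    //         UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE,
+    //         UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE,
+    //         UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE, &hMap);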
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -12333,6 +12532,49 @@ typedef struct ur_command_buffer_command_get_info_exp_params_t { size_t **ppPropSizeRet; } ur_command_buffer_command_get_info_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urTensorMapEncodeIm2ColExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_tensor_map_encode_im_2_col_exp_params_t { + ur_device_handle_t *phDevice; + ur_exp_tensor_map_data_type_flags_t *pTensorMapType; + uint32_t *pTensorRank; + void **pGlobalAddress; + const uint64_t **pGlobalDim; + const uint64_t **pGlobalStrides; + const int **pPixelBoxLowerCorner; + const int **pPixelBoxUpperCorner; + uint32_t *pChannelsPerPixel; + uint32_t *pPixelsPerColumn; + const uint32_t **pElementStrides; + ur_exp_tensor_map_interleave_flags_t *pInterleave; + ur_exp_tensor_map_swizzle_flags_t *pSwizzle; + ur_exp_tensor_map_l2_promotion_flags_t *pL2Promotion; + ur_exp_tensor_map_oob_fill_flags_t *pOobFill; + ur_exp_tensor_map_handle_t **phTensorMap; +} ur_tensor_map_encode_im_2_col_exp_params_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urTensorMapEncodeTiledExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_tensor_map_encode_tiled_exp_params_t { + ur_device_handle_t *phDevice; + ur_exp_tensor_map_data_type_flags_t *pTensorMapType; + uint32_t *pTensorRank; + void **pGlobalAddress; + const uint64_t **pGlobalDim; + const uint64_t **pGlobalStrides; + const uint32_t **pBoxDim; + const uint32_t **pElementStrides; + ur_exp_tensor_map_interleave_flags_t *pInterleave; + ur_exp_tensor_map_swizzle_flags_t *pSwizzle; + ur_exp_tensor_map_l2_promotion_flags_t *pL2Promotion; + ur_exp_tensor_map_oob_fill_flags_t *pOobFill; + ur_exp_tensor_map_handle_t **phTensorMap; +} ur_tensor_map_encode_tiled_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urUsmP2PEnablePeerAccessExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 40a6c5c269..695c1885b0 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -2248,6 +2248,71 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetCommandBufferExpProcAddrTable_t)( ur_api_version_t, ur_command_buffer_exp_dditable_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urTensorMapEncodeIm2ColExp +typedef ur_result_t(UR_APICALL *ur_pfnTensorMapEncodeIm2ColExp_t)( + ur_device_handle_t, + ur_exp_tensor_map_data_type_flags_t, + uint32_t, + void *, + const uint64_t *, + const uint64_t *, + const int *, + const int *, + uint32_t, + uint32_t, + const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urTensorMapEncodeTiledExp +typedef ur_result_t(UR_APICALL *ur_pfnTensorMapEncodeTiledExp_t)( + ur_device_handle_t, + ur_exp_tensor_map_data_type_flags_t, + uint32_t, + void *, + 
const uint64_t *, + const uint64_t *, + const uint32_t *, + const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Table of TensorMapExp functions pointers +typedef struct ur_tensor_map_exp_dditable_t { + ur_pfnTensorMapEncodeIm2ColExp_t pfnEncodeIm2ColExp; + ur_pfnTensorMapEncodeTiledExp_t pfnEncodeTiledExp; +} ur_tensor_map_exp_dditable_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's TensorMapExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL +urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_tensor_map_exp_dditable_t *pDdiTable ///< [in,out] pointer to table of DDI function pointers +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urGetTensorMapExpProcAddrTable +typedef ur_result_t(UR_APICALL *ur_pfnGetTensorMapExpProcAddrTable_t)( + ur_api_version_t, + ur_tensor_map_exp_dditable_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urUsmP2PEnablePeerAccessExp typedef ur_result_t(UR_APICALL *ur_pfnUsmP2PEnablePeerAccessExp_t)( @@ -2515,6 +2580,7 @@ typedef struct ur_dditable_t { ur_usm_dditable_t USM; ur_usm_exp_dditable_t USMExp; ur_command_buffer_exp_dditable_t CommandBufferExp; + ur_tensor_map_exp_dditable_t TensorMapExp; ur_usm_p2p_exp_dditable_t UsmP2PExp; ur_virtual_mem_dditable_t VirtualMem; ur_device_dditable_t Device; diff --git a/include/ur_print.h b/include/ur_print.h index c2adb18067..3782ffb5ce 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1098,6 +1098,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandFlags(enum ur_ /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandProperties(const struct ur_exp_enqueue_native_command_properties_t params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_data_type_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpTensorMapDataTypeFlags(enum ur_exp_tensor_map_data_type_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_interleave_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpTensorMapInterleaveFlags(enum ur_exp_tensor_map_interleave_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_l2_promotion_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < 
out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpTensorMapL2PromotionFlags(enum ur_exp_tensor_map_l2_promotion_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_swizzle_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpTensorMapSwizzleFlags(enum ur_exp_tensor_map_swizzle_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_oob_fill_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpTensorMapOobFillFlags(enum ur_exp_tensor_map_oob_fill_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_loader_config_create_params_t struct /// @returns @@ -2522,6 +2562,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferGetInfoExpParams(const s /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferCommandGetInfoExpParams(const struct ur_command_buffer_command_get_info_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_tensor_map_encode_im_2_col_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintTensorMapEncodeIm_2ColExpParams(const struct ur_tensor_map_encode_im_2_col_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_tensor_map_encode_tiled_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintTensorMapEncodeTiledExpParams(const struct ur_tensor_map_encode_tiled_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_usm_p2p_enable_peer_access_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 8888a74f91..dafe882726 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -56,6 +56,8 @@ template <> struct is_handle : std::true_type {}; template <> struct is_handle : std::true_type {}; +template <> +struct is_handle : std::true_type {}; template inline constexpr bool is_handle_v = is_handle::value; template @@ -222,6 +224,21 @@ inline ur_result_t printFlag(std::ostream &os, uint32 template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + } // namespace ur::details 
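+// Usage sketch for the declarations above: operator<< prints a single
+// enumerator, while ur::details::printFlag decomposes a combined bitmask,
+// joining recognized bits with " | " and reporting any leftover bits.
+// Illustrative only:
+//
+//     std::ostringstream ss;
+//     ur::details::printFlag<ur_exp_tensor_map_data_type_flag_t>(
+//         ss, UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8 |
+//                 UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32);
+//     // ss.str() == "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8 | "
+//     //             "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32"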
inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value);
@@ -359,6 +376,11 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_ext_flag_t
 inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_enqueue_ext_properties_t params);
 inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_native_command_flag_t value);
 inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_enqueue_native_command_properties_t params);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_data_type_flag_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_interleave_flag_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_l2_promotion_flag_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_swizzle_flag_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_oob_fill_flag_t value);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_function_t type
@@ -965,6 +987,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
         break;
     case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT:
         os << "UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT";
+        break;
+    case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP:
+        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP";
+        break;
+    case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP:
+        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP";
         break;
     default:
         os << "unknown enumerator";
@@ -10662,6 +10690,504 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_enqueue_na
     os << "}";
     return os;
 }
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_exp_tensor_map_data_type_flag_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_data_type_flag_t value) {
+    switch (value) {
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32";
+        break;
+    case UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ:
+        os << "UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ";
+        break;
+    default:
+        os << "unknown enumerator";
break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_data_type_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ; + } + + if ((val & 
UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32; + } + + if ((val & UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ) == (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_tensor_map_interleave_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_interleave_flag_t value) { + switch (value) { + case UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE: + os << "UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE"; + break; + case UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B: + os << "UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B"; + break; + case UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B: + os << "UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_interleave_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE) == (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE; + } + + if ((val & UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B) == (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B; + } + + if ((val & UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B) == (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_tensor_map_l2_promotion_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_l2_promotion_flag_t value) { + switch (value) { + case UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE: + os << "UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE"; + break; + case UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B: + os << "UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B"; + break; + case UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B: + os << "UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B"; + break; + case UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B: + os 
<< "UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_l2_promotion_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE) == (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE; + } + + if ((val & UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B) == (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B; + } + + if ((val & UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B) == (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B; + } + + if ((val & UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B) == (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_tensor_map_swizzle_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_swizzle_flag_t value) { + switch (value) { + case UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE: + os << "UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE"; + break; + case UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B: + os << "UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B"; + break; + case UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B: + os << "UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B"; + break; + case UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B: + os << "UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_swizzle_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE) == (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE; + } + + if ((val & UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B) == (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B; + } + + if ((val & UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B) == (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B; + 
if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B; + } + + if ((val & UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B) == (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_tensor_map_oob_fill_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_tensor_map_oob_fill_flag_t value) { + switch (value) { + case UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE: + os << "UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE"; + break; + case UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA: + os << "UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_tensor_map_oob_fill_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE) == (uint32_t)UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE; + } + + if ((val & UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA) == (uint32_t)UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA) { + val ^= (uint32_t)UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_loader_config_create_params_t type @@ -17932,6 +18457,192 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_tensor_map_encode_im_2_col_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_tensor_map_encode_im_2_col_exp_params_t *params) { + + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + + os << ", "; + os << ".TensorMapType = "; + + ur::details::printFlag(os, + *(params->pTensorMapType)); + + os << ", "; + os << ".TensorRank = "; + + os << *(params->pTensorRank); + + os << ", "; + os << ".GlobalAddress = "; + + ur::details::printPtr(os, + *(params->pGlobalAddress)); + + os << ", "; + os << ".GlobalDim = "; + + ur::details::printPtr(os, + *(params->pGlobalDim)); + + os << ", "; + os << ".GlobalStrides = "; + + ur::details::printPtr(os, + *(params->pGlobalStrides)); + + os << ", "; + os << ".PixelBoxLowerCorner = "; + + 
ur::details::printPtr(os, + *(params->pPixelBoxLowerCorner)); + + os << ", "; + os << ".PixelBoxUpperCorner = "; + + ur::details::printPtr(os, + *(params->pPixelBoxUpperCorner)); + + os << ", "; + os << ".ChannelsPerPixel = "; + + os << *(params->pChannelsPerPixel); + + os << ", "; + os << ".PixelsPerColumn = "; + + os << *(params->pPixelsPerColumn); + + os << ", "; + os << ".ElementStrides = "; + + ur::details::printPtr(os, + *(params->pElementStrides)); + + os << ", "; + os << ".Interleave = "; + + ur::details::printFlag(os, + *(params->pInterleave)); + + os << ", "; + os << ".Swizzle = "; + + ur::details::printFlag(os, + *(params->pSwizzle)); + + os << ", "; + os << ".L2Promotion = "; + + ur::details::printFlag(os, + *(params->pL2Promotion)); + + os << ", "; + os << ".OobFill = "; + + ur::details::printFlag(os, + *(params->pOobFill)); + + os << ", "; + os << ".hTensorMap = "; + + ur::details::printPtr(os, + *(params->phTensorMap)); + + return os; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_tensor_map_encode_tiled_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_tensor_map_encode_tiled_exp_params_t *params) { + + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + + os << ", "; + os << ".TensorMapType = "; + + ur::details::printFlag(os, + *(params->pTensorMapType)); + + os << ", "; + os << ".TensorRank = "; + + os << *(params->pTensorRank); + + os << ", "; + os << ".GlobalAddress = "; + + ur::details::printPtr(os, + *(params->pGlobalAddress)); + + os << ", "; + os << ".GlobalDim = "; + + ur::details::printPtr(os, + *(params->pGlobalDim)); + + os << ", "; + os << ".GlobalStrides = "; + + ur::details::printPtr(os, + *(params->pGlobalStrides)); + + os << ", "; + os << ".BoxDim = "; + + ur::details::printPtr(os, + *(params->pBoxDim)); + + os << ", "; + os << ".ElementStrides = "; + + ur::details::printPtr(os, + *(params->pElementStrides)); + + os << ", "; + os << ".Interleave = "; + + ur::details::printFlag(os, + *(params->pInterleave)); + + os << ", "; + os << ".Swizzle = "; + + ur::details::printFlag(os, + *(params->pSwizzle)); + + os << ", "; + os << ".L2Promotion = "; + + ur::details::printFlag(os, + *(params->pL2Promotion)); + + os << ", "; + os << ".OobFill = "; + + ur::details::printFlag(os, + *(params->pOobFill)); + + os << ", "; + os << ".hTensorMap = "; + + ur::details::printPtr(os, + *(params->phTensorMap)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_usm_p2p_enable_peer_access_exp_params_t type /// @returns @@ -19170,6 +19881,12 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP: { os << (const struct ur_command_buffer_command_get_info_exp_params_t *)params; } break; + case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP: { + os << (const struct ur_tensor_map_encode_im_2_col_exp_params_t *)params; + } break; + case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP: { + os << (const struct ur_tensor_map_encode_tiled_exp_params_t *)params; + } break; case UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP: { os << (const struct ur_usm_p2p_enable_peer_access_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-TENSOR-MAP.rst b/scripts/core/EXP-TENSOR-MAP.rst new file mode 100644 index 0000000000..3679f3cfd1 --- /dev/null +++ 
b/scripts/core/EXP-TENSOR-MAP.rst
@@ -0,0 +1,69 @@
+<%
+    OneApi=tags['$OneApi']
+    x=tags['$x']
+    X=x.upper()
+%>
+
+.. _experimental-tensor-map:
+
+================================================================================
+Tensor Mapping APIs
+================================================================================
+
+.. warning::
+
+    Experimental features:
+
+    * May be replaced, updated, or removed at any time.
+    * Do not require maintaining API/ABI stability of their own additions over
+      time.
+    * Do not require conformance testing of their own additions.
+
+
+Motivation
+--------------------------------------------------------------------------------
+
+These APIs are used to target the CUDA driver entry points
+``cuTensorMapEncodeIm2col`` and ``cuTensorMapEncodeTiled``.
+
+API
+--------------------------------------------------------------------------------
+
+Enums
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+${x}_exp_tensor_map_data_type_flags_t
+${x}_exp_tensor_map_interleave_flags_t
+${x}_exp_tensor_map_l2_promotion_flags_t
+${x}_exp_tensor_map_swizzle_flags_t
+${x}_exp_tensor_map_oob_fill_flags_t
+
+Types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+${x}_exp_tensor_map_handle_t
+
+Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}TensorMapEncodeIm2ColExp
+* ${x}TensorMapEncodeTiledExp
+
+Changelog
+--------------------------------------------------------------------------------
+
++-----------+------------------------+
+| Revision  | Changes                |
++===========+========================+
+| 1.0       | Initial Draft          |
++-----------+------------------------+
+
+
+Support
+--------------------------------------------------------------------------------
+
+This is only supported in the CUDA adapter.
+
+Contributors
+--------------------------------------------------------------------------------
+
+* Hugh Delaney `hugh.delaney@codeplay.com <hugh.delaney@codeplay.com>`_
diff --git a/scripts/core/exp-tensor-map.yml b/scripts/core/exp-tensor-map.yml
new file mode 100644
index 0000000000..258a2403f0
--- /dev/null
+++ b/scripts/core/exp-tensor-map.yml
@@ -0,0 +1,207 @@
+#
+# Copyright (C) 2024 Intel Corporation
+#
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental API for enqueuing work through native APIs" +ordinal: "100" + +--- #-------------------------------------------------------------------------- +type: handle +desc: "Handle of tensor map object" +class: $xTensorMap +name: "$x_exp_tensor_map_handle_t" + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Tensor map data type" +name: $x_exp_tensor_map_data_type_flags_t +etors: + - name: UINT8 + desc: "1 byte" + - name: UINT16 + desc: "2 bytes" + - name: UINT32 + desc: "4 bytes" + - name: INT32 + desc: "4 bytes" + - name: UINT64 + desc: "8 bytes" + - name: INT64 + desc: "8 bytes" + - name: FLOAT16 + desc: "2 bytes" + - name: FLOAT32 + desc: "4 bytes" + - name: FLOAT64 + desc: "8 bytes" + - name: BFLOAT16 + desc: "2 bytes" + - name: FLOAT32_FTZ + desc: "4 bytes" + - name: TFLOAT32 + desc: "4 bytes" + - name: TFLOAT32_FTZ + desc: "4 bytes" + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Tensor map interleave" +name: $x_exp_tensor_map_interleave_flags_t +etors: + - name: NONE + desc: "No interleave" + - name: 16B + desc: "16B interleave" + - name: 32B + desc: "32B interleave" + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Tensor map l2 promotion" +name: $x_exp_tensor_map_l2_promotion_flags_t +etors: + - name: NONE + desc: "No promotion type" + - name: 64B + desc: "64B promotion type" + - name: 128B + desc: "128B promotion type" + - name: 256B + desc: "256B promotion type" + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Tensor map swizzle" +name: $x_exp_tensor_map_swizzle_flags_t +etors: + - name: NONE + desc: "No swizzle" + - name: 32B + desc: "32B swizzle" + - name: 64B + desc: "64B swizzle" + - name: 128B + desc: "128B swizzle" + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Tensor map OOB fill" +name: $x_exp_tensor_map_oob_fill_flags_t +etors: + - name: NONE + desc: "No OOB fill" + - name: REQUEST_ZERO_FMA + desc: "Refer to NVIDIA docs" + +--- #-------------------------------------------------------------------------- +type: function +desc: "Encode tensor map with image data" +class: $xTensorMap +name: EncodeIm2ColExp +details: + - "Map encode using im2col." +params: + - type: $x_device_handle_t + name: hDevice + desc: "[in] Handle of the device object." + - type: $x_exp_tensor_map_data_type_flags_t + name: TensorMapType + desc: "[in] Data type of the tensor object." + - type: uint32_t + name: TensorRank + desc: "[in] Dimensionality of tensor; must be at least 3." + - type: void* + name: GlobalAddress + desc: "[in] Starting address of memory region described by tensor." + - type: const uint64_t* + name: GlobalDim + desc: "[in] Array containing tensor size (number of elements) along each of the TensorRank dimensions." + - type: const uint64_t* + name: GlobalStrides + desc: "[in] Array containing stride size (in bytes) along each of the tensorRank - 1 dimensions." + - type: const int* + name: PixelBoxLowerCorner + desc: "[in] Array containing DHW dimensions of lower box corner." 
+ - type: const int* + name: PixelBoxUpperCorner + desc: "[in] Array containing DHW dimensions of upper box corner." + - type: uint32_t + name: ChannelsPerPixel + desc: "[in] Number of channels per pixel." + - type: uint32_t + name: PixelsPerColumn + desc: "[in] Number of pixels per column." + - type: const uint32_t* + name: ElementStrides + desc: "[in] Array containing traversal stride in each of the tensorRank dimensions." + - type: $x_exp_tensor_map_interleave_flags_t + name: Interleave + desc: "[in] Type of interleaved layout the tensor addresses" + - type: $x_exp_tensor_map_swizzle_flags_t + name: Swizzle + desc: "[in] Bank swizzling pattern inside shared memory" + - type: $x_exp_tensor_map_l2_promotion_flags_t + name: L2Promotion + desc: "[in] L2 promotion size." + - type: $x_exp_tensor_map_oob_fill_flags_t + name: OobFill + desc: "[in] Indicate whether zero or special NaN constant will be used to fill out-of-bound elements." + - type: $x_exp_tensor_map_handle_t* + name: hTensorMap + desc: "[out] Handle of the tensor map object." + +--- #-------------------------------------------------------------------------- +type: function +desc: "Encode tensor map with tiled data" +class: $xTensorMap +name: EncodeTiledExp +details: + - "Tiled map encode." +params: + - type: $x_device_handle_t + name: hDevice + desc: "[in] Handle of the device object." + - type: $x_exp_tensor_map_data_type_flags_t + name: TensorMapType + desc: "[in] Data type of the tensor object." + - type: uint32_t + name: TensorRank + desc: "[in] Dimensionality of tensor; must be at least 3." + - type: void* + name: GlobalAddress + desc: "[in] Starting address of memory region described by tensor." + - type: const uint64_t* + name: GlobalDim + desc: "[in] Array containing tensor size (number of elements) along each of the TensorRank dimensions." + - type: const uint64_t* + name: GlobalStrides + desc: "[in] Array containing stride size (in bytes) along each of the tensorRank - 1 dimensions." + - type: const uint32_t* + name: BoxDim + desc: "[in] Array containing traversal box size (number of elments) along each of the tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension." + - type: const uint32_t* + name: ElementStrides + desc: "[in] Array containing traversal stride in each of the tensorRank dimensions." + - type: $x_exp_tensor_map_interleave_flags_t + name: Interleave + desc: "[in] Type of interleaved layout the tensor addresses" + - type: $x_exp_tensor_map_swizzle_flags_t + name: Swizzle + desc: "[in] Bank swizzling pattern inside shared memory" + - type: $x_exp_tensor_map_l2_promotion_flags_t + name: L2Promotion + desc: "[in] L2 promotion size." + - type: $x_exp_tensor_map_oob_fill_flags_t + name: OobFill + desc: "[in] Indicate whether zero or special NaN constant will be used to fill out-of-bound elements." + - type: $x_exp_tensor_map_handle_t* + name: hTensorMap + desc: "[out] Handle of the tensor map object." 
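+# Informative note: the two parameter lists above were chosen to line up
+# one-to-one with CUDA's cuTensorMapEncodeIm2col and cuTensorMapEncodeTiled
+# (see EXP-TENSOR-MAP.rst), so the CUDA adapter should be able to forward
+# each argument directly once the flag enums are translated to their
+# CUtensorMap* equivalents.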
+ diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 2133e1c889..059e23c2a0 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -607,6 +607,12 @@ etors: - name: ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT desc: Enumerator for $xEnqueueEventsWaitWithBarrierExt value: '246' +- name: TENSOR_MAP_ENCODE_IM_2_COL_EXP + desc: Enumerator for $xTensorMapEncodeIm2ColExp + value: '230' +- name: TENSOR_MAP_ENCODE_TILED_EXP + desc: Enumerator for $xTensorMapEncodeTiledExp + value: '231' --- type: enum desc: Defines structure types diff --git a/source/adapters/adapter.def.in b/source/adapters/adapter.def.in index 3c18c78bd1..fd37178966 100644 --- a/source/adapters/adapter.def.in +++ b/source/adapters/adapter.def.in @@ -16,6 +16,7 @@ EXPORTS urGetProgramExpProcAddrTable urGetQueueProcAddrTable urGetSamplerProcAddrTable + urGetTensorMapExpProcAddrTable urGetUSMProcAddrTable urGetUSMExpProcAddrTable urGetUsmP2PExpProcAddrTable diff --git a/source/adapters/adapter.map.in b/source/adapters/adapter.map.in index bb08ae7d88..50db54ef40 100644 --- a/source/adapters/adapter.map.in +++ b/source/adapters/adapter.map.in @@ -16,6 +16,7 @@ urGetProgramExpProcAddrTable; urGetQueueProcAddrTable; urGetSamplerProcAddrTable; + urGetTensorMapExpProcAddrTable; urGetUSMProcAddrTable; urGetUSMExpProcAddrTable; urGetUsmP2PExpProcAddrTable; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 42c342444d..f2849e73ff 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10731,6 +10731,172 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeIm2ColExp +__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const int * + PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. + const int * + PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. + uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. 
+ ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_tensor_map_encode_im_2_col_exp_params_t params = {&hDevice, + &TensorMapType, + &TensorRank, + &GlobalAddress, + &GlobalDim, + &GlobalStrides, + &PixelBoxLowerCorner, + &PixelBoxUpperCorner, + &ChannelsPerPixel, + &PixelsPerColumn, + &ElementStrides, + &Interleave, + &Swizzle, + &L2Promotion, + &OobFill, + &hTensorMap}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urTensorMapEncodeIm2ColExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urTensorMapEncodeIm2ColExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + *hTensorMap = mock::createDummyHandle(); + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urTensorMapEncodeIm2ColExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeTiledExp +__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t * + BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. 
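+    // The mock follows the same interception pattern as EncodeIm2ColExp
+    // above: an optional "before" callback may inspect or veto the call, a
+    // "replace" callback may stand in for a real implementation, and when
+    // neither replaces it the entry point just returns a dummy handle from
+    // mock::createDummyHandle, which lets API-level tests run without a
+    // real driver.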
+    ) try {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    ur_tensor_map_encode_tiled_exp_params_t params = {
+        &hDevice,    &TensorMapType, &TensorRank,  &GlobalAddress,
+        &GlobalDim,  &GlobalStrides, &BoxDim,      &ElementStrides,
+        &Interleave, &Swizzle,       &L2Promotion, &OobFill,
+        &hTensorMap};
+
+    auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
+        mock::getCallbacks().get_before_callback("urTensorMapEncodeTiledExp"));
+    if (beforeCallback) {
+        result = beforeCallback(&params);
+        if (result != UR_RESULT_SUCCESS) {
+            return result;
+        }
+    }
+
+    auto replaceCallback = reinterpret_cast<ur_mock_callback_t>(
+        mock::getCallbacks().get_replace_callback("urTensorMapEncodeTiledExp"));
+    if (replaceCallback) {
+        result = replaceCallback(&params);
+    } else {
+
+        *hTensorMap = mock::createDummyHandle<ur_exp_tensor_map_handle_t>();
+        result = UR_RESULT_SUCCESS;
+    }
+
+    if (result != UR_RESULT_SUCCESS) {
+        return result;
+    }
+
+    auto afterCallback = reinterpret_cast<ur_mock_callback_t>(
+        mock::getCallbacks().get_after_callback("urTensorMapEncodeTiledExp"));
+    if (afterCallback) {
+        return afterCallback(&params);
+    }
+
+    return result;
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 } // namespace driver
 
 #if defined(__cplusplus)
@@ -11550,6 +11716,38 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's TensorMapExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+UR_DLLEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_tensor_map_exp_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+    ) try {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (driver::d_context.version < version) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnEncodeIm2ColExp = driver::urTensorMapEncodeIm2ColExp;
+
+    pDdiTable->pfnEncodeTiledExp = driver::urTensorMapEncodeTiledExp;
+
+    return result;
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Exported function for filling application's USM table
 ///        with current process' addresses
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 64489c39ac..5b28fd9f30 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -9221,6 +9221,158 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urTensorMapEncodeIm2ColExp
+__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp(
+    ur_device_handle_t hDevice, ///< [in] Handle of the device object.
+    ur_exp_tensor_map_data_type_flags_t
+        TensorMapType, ///< [in] Data type of the tensor object.
+    uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3.
+    void *
+        GlobalAddress, ///< [in] Starting address of memory region described by tensor.
+    const uint64_t *
+        GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of
+                   ///< the TensorRank dimensions.
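+    // The tracing wrapper below never interprets these arguments; it packs
+    // their addresses into a ur_tensor_map_encode_im_2_col_exp_params_t and
+    // hands that to notify_begin/notify_end, so a subscriber on the tracing
+    // stream can inspect (or rewrite) each argument in place. A sketch of a
+    // subscriber-side read, assuming the generated pointer-per-argument field
+    // naming used by the other params structs (an assumption, not shown here):
+    //
+    //   auto *p = static_cast<
+    //       const ur_tensor_map_encode_im_2_col_exp_params_t *>(pArgs);
+    //   uint32_t rank = *p->pTensorRank; // each field points at the argument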
+ const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const int * + PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. + const int * + PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. + uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. +) { + auto pfnEncodeIm2ColExp = + getContext()->urDdiTable.TensorMapExp.pfnEncodeIm2ColExp; + + if (nullptr == pfnEncodeIm2ColExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_tensor_map_encode_im_2_col_exp_params_t params = {&hDevice, + &TensorMapType, + &TensorRank, + &GlobalAddress, + &GlobalDim, + &GlobalStrides, + &PixelBoxLowerCorner, + &PixelBoxUpperCorner, + &ChannelsPerPixel, + &PixelsPerColumn, + &ElementStrides, + &Interleave, + &Swizzle, + &L2Promotion, + &OobFill, + &hTensorMap}; + uint64_t instance = + getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, + "urTensorMapEncodeIm2ColExp", ¶ms); + + getContext()->logger.info("---> urTensorMapEncodeIm2ColExp"); + + ur_result_t result = pfnEncodeIm2ColExp( + hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, + GlobalStrides, PixelBoxLowerCorner, PixelBoxUpperCorner, + ChannelsPerPixel, PixelsPerColumn, ElementStrides, Interleave, Swizzle, + L2Promotion, OobFill, hTensorMap); + + getContext()->notify_end(UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, + "urTensorMapEncodeIm2ColExp", ¶ms, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, ¶ms); + getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeTiledExp +__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t * + BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. 
+ const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. +) { + auto pfnEncodeTiledExp = + getContext()->urDdiTable.TensorMapExp.pfnEncodeTiledExp; + + if (nullptr == pfnEncodeTiledExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_tensor_map_encode_tiled_exp_params_t params = { + &hDevice, &TensorMapType, &TensorRank, &GlobalAddress, + &GlobalDim, &GlobalStrides, &BoxDim, &ElementStrides, + &Interleave, &Swizzle, &L2Promotion, &OobFill, + &hTensorMap}; + uint64_t instance = + getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, + "urTensorMapEncodeTiledExp", ¶ms); + + getContext()->logger.info("---> urTensorMapEncodeTiledExp"); + + ur_result_t result = pfnEncodeTiledExp( + hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, + GlobalStrides, BoxDim, ElementStrides, Interleave, Swizzle, L2Promotion, + OobFill, hTensorMap); + + getContext()->notify_end(UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, + "urTensorMapEncodeTiledExp", ¶ms, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, ¶ms); + getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -10266,6 +10418,43 @@ __urdlllocal ur_result_t UR_APICALL urGetSamplerProcAddrTable( return result; } /////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's TensorMapExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_tensor_map_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto &dditable = ur_tracing_layer::getContext()->urDdiTable.TensorMapExp; + + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_tracing_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_tracing_layer::getContext()->version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + dditable.pfnEncodeIm2ColExp = pDdiTable->pfnEncodeIm2ColExp; + pDdiTable->pfnEncodeIm2ColExp = + ur_tracing_layer::urTensorMapEncodeIm2ColExp; + + dditable.pfnEncodeTiledExp = pDdiTable->pfnEncodeTiledExp; + pDdiTable->pfnEncodeTiledExp = ur_tracing_layer::urTensorMapEncodeTiledExp; + + return result; +} +/////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for 
filling application's USM table /// with current process' addresses /// @@ -10610,6 +10799,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Sampler); } + if (UR_RESULT_SUCCESS == result) { + result = ur_tracing_layer::urGetTensorMapExpProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->TensorMapExp); + } + if (UR_RESULT_SUCCESS == result) { result = ur_tracing_layer::urGetUSMProcAddrTable(UR_API_VERSION_CURRENT, &dditable->USM); diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b3969de10f..a46da9af2b 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -10274,6 +10274,221 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeIm2ColExp +__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const int * + PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. + const int * + PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. + uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. 
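+    // The *_FLAGS_MASK tests below fail on any bit outside the enum's defined
+    // range, so each flags argument is expected to carry known bits only (the
+    // CUDA adapter later in this series additionally assumes a single flag).
+    // A call shaped to pass parameter validation might look like this; the
+    // handle and array variables are illustrative placeholders:
+    //
+    //   urTensorMapEncodeIm2ColExp(hDev,
+    //                              UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32,
+    //                              /*TensorRank=*/3, devPtr, dims, strides,
+    //                              lower, upper, /*ChannelsPerPixel=*/4,
+    //                              /*PixelsPerColumn=*/64, elemStrides,
+    //                              UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE,
+    //                              UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE,
+    //                              UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE,
+    //                              UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE,
+    //                              &hMap);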
+) { + auto pfnEncodeIm2ColExp = + getContext()->urDdiTable.TensorMapExp.pfnEncodeIm2ColExp; + + if (nullptr == pfnEncodeIm2ColExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == GlobalAddress) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == GlobalDim) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == GlobalStrides) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == PixelBoxLowerCorner) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == PixelBoxUpperCorner) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == ElementStrides) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == hTensorMap) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + + ur_result_t result = pfnEncodeIm2ColExp( + hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, + GlobalStrides, PixelBoxLowerCorner, PixelBoxUpperCorner, + ChannelsPerPixel, PixelsPerColumn, ElementStrides, Interleave, Swizzle, + L2Promotion, OobFill, hTensorMap); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeTiledExp +__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t * + BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. 
+ ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. +) { + auto pfnEncodeTiledExp = + getContext()->urDdiTable.TensorMapExp.pfnEncodeTiledExp; + + if (nullptr == pfnEncodeTiledExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == GlobalAddress) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == GlobalDim) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == GlobalStrides) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == BoxDim) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == ElementStrides) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == hTensorMap) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + + ur_result_t result = pfnEncodeTiledExp( + hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, + GlobalStrides, BoxDim, ElementStrides, Interleave, Swizzle, L2Promotion, + OobFill, hTensorMap); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -11344,6 +11559,45 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's TensorMapExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_tensor_map_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto &dditable = ur_validation_layer::getContext()->urDdiTable.TensorMapExp; + + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_validation_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_validation_layer::getContext()->version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + dditable.pfnEncodeIm2ColExp = pDdiTable->pfnEncodeIm2ColExp; + pDdiTable->pfnEncodeIm2ColExp = + ur_validation_layer::urTensorMapEncodeIm2ColExp; + + dditable.pfnEncodeTiledExp = pDdiTable->pfnEncodeTiledExp; + 
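+    // Same interposition pattern as the other layer tables: the incoming
+    // (lower-layer or adapter) pointer is saved into this layer's dditable
+    // first, then the caller-visible slot is overwritten with the validating
+    // wrapper, so the effective chain becomes
+    //
+    //   app -> validation layer -> saved pfnEncodeTiledExp (tracing/adapter)
+    //
+    // and each wrapper forwards through the pointer captured here.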
pDdiTable->pfnEncodeTiledExp = + ur_validation_layer::urTensorMapEncodeTiledExp; + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's USM table /// with current process' addresses @@ -11711,6 +11965,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Sampler); } + if (UR_RESULT_SUCCESS == result) { + result = ur_validation_layer::urGetTensorMapExpProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->TensorMapExp); + } + if (UR_RESULT_SUCCESS == result) { result = ur_validation_layer::urGetUSMProcAddrTable( UR_API_VERSION_CURRENT, &dditable->USM); diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index a336da153d..5ca6d99113 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -119,6 +119,7 @@ EXPORTS urGetProgramProcAddrTable urGetQueueProcAddrTable urGetSamplerProcAddrTable + urGetTensorMapExpProcAddrTable urGetUSMExpProcAddrTable urGetUSMProcAddrTable urGetUsmP2PExpProcAddrTable @@ -331,6 +332,11 @@ EXPORTS urPrintExpSamplerCubemapFilterMode urPrintExpSamplerCubemapProperties urPrintExpSamplerMipProperties + urPrintExpTensorMapDataTypeFlags + urPrintExpTensorMapInterleaveFlags + urPrintExpTensorMapL2PromotionFlags + urPrintExpTensorMapOobFillFlags + urPrintExpTensorMapSwizzleFlags urPrintExpWin32Handle urPrintFunction urPrintFunctionParams @@ -465,6 +471,8 @@ EXPORTS urPrintSamplerRetainParams urPrintSpecializationConstantInfo urPrintStructureType + urPrintTensorMapEncodeIm_2ColExpParams + urPrintTensorMapEncodeTiledExpParams urPrintUsmAdviceFlags urPrintUsmAllocInfo urPrintUsmAllocLocationDesc @@ -535,6 +543,8 @@ EXPORTS urSamplerGetNativeHandle urSamplerRelease urSamplerRetain + urTensorMapEncodeIm2ColExp + urTensorMapEncodeTiledExp urUSMDeviceAlloc urUSMFree urUSMGetMemAllocInfo diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 59a8a8d107..706d28dd01 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -119,6 +119,7 @@ urGetProgramProcAddrTable; urGetQueueProcAddrTable; urGetSamplerProcAddrTable; + urGetTensorMapExpProcAddrTable; urGetUSMExpProcAddrTable; urGetUSMProcAddrTable; urGetUsmP2PExpProcAddrTable; @@ -331,6 +332,11 @@ urPrintExpSamplerCubemapFilterMode; urPrintExpSamplerCubemapProperties; urPrintExpSamplerMipProperties; + urPrintExpTensorMapDataTypeFlags; + urPrintExpTensorMapInterleaveFlags; + urPrintExpTensorMapL2PromotionFlags; + urPrintExpTensorMapOobFillFlags; + urPrintExpTensorMapSwizzleFlags; urPrintExpWin32Handle; urPrintFunction; urPrintFunctionParams; @@ -465,6 +471,8 @@ urPrintSamplerRetainParams; urPrintSpecializationConstantInfo; urPrintStructureType; + urPrintTensorMapEncodeIm_2ColExpParams; + urPrintTensorMapEncodeTiledExpParams; urPrintUsmAdviceFlags; urPrintUsmAllocInfo; urPrintUsmAllocLocationDesc; @@ -535,6 +543,8 @@ urSamplerGetNativeHandle; urSamplerRelease; urSamplerRetain; + urTensorMapEncodeIm2ColExp; + urTensorMapEncodeTiledExp; urUSMDeviceAlloc; urUSMFree; urUSMGetMemAllocInfo; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 86a6ad95a0..598e92c311 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -9364,6 +9364,149 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urTensorMapEncodeIm2ColExp +__urdlllocal 
ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp(
+    ur_device_handle_t hDevice, ///< [in] Handle of the device object.
+    ur_exp_tensor_map_data_type_flags_t
+        TensorMapType, ///< [in] Data type of the tensor object.
+    uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3.
+    void *
+        GlobalAddress, ///< [in] Starting address of memory region described by tensor.
+    const uint64_t *
+        GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of
+                   ///< the TensorRank dimensions.
+    const uint64_t *
+        GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the
+                       ///< tensorRank - 1 dimensions.
+    const int *
+        PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner.
+    const int *
+        PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner.
+    uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel.
+    uint32_t PixelsPerColumn,  ///< [in] Number of pixels per column.
+    const uint32_t *
+        ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank
+                        ///< dimensions.
+    ur_exp_tensor_map_interleave_flags_t
+        Interleave, ///< [in] Type of interleaved layout the tensor addresses
+    ur_exp_tensor_map_swizzle_flags_t
+        Swizzle, ///< [in] Bank swizzling pattern inside shared memory
+    ur_exp_tensor_map_l2_promotion_flags_t
+        L2Promotion, ///< [in] L2 promotion size.
+    ur_exp_tensor_map_oob_fill_flags_t
+        OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to
+                 ///< fill out-of-bound elements.
+    ur_exp_tensor_map_handle_t
+        *hTensorMap ///< [out] Handle of the tensor map object.
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    [[maybe_unused]] auto context = getContext();
+
+    // extract platform's function pointer table
+    auto dditable = reinterpret_cast<ur_device_object_t *>(hDevice)->dditable;
+    auto pfnEncodeIm2ColExp = dditable->ur.TensorMapExp.pfnEncodeIm2ColExp;
+    if (nullptr == pfnEncodeIm2ColExp) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    // convert loader handle to platform handle
+    hDevice = reinterpret_cast<ur_device_object_t *>(hDevice)->handle;
+
+    // forward to device-platform
+    result = pfnEncodeIm2ColExp(
+        hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim,
+        GlobalStrides, PixelBoxLowerCorner, PixelBoxUpperCorner,
+        ChannelsPerPixel, PixelsPerColumn, ElementStrides, Interleave, Swizzle,
+        L2Promotion, OobFill, hTensorMap);
+
+    if (UR_RESULT_SUCCESS != result) {
+        return result;
+    }
+
+    try {
+        // convert platform handle to loader handle
+        *hTensorMap = reinterpret_cast<ur_exp_tensor_map_handle_t>(
+            context->factories.ur_exp_tensor_map_factory.getInstance(
+                *hTensorMap, dditable));
+    } catch (std::bad_alloc &) {
+        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }
+
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urTensorMapEncodeTiledExp
+__urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp(
+    ur_device_handle_t hDevice, ///< [in] Handle of the device object.
+    ur_exp_tensor_map_data_type_flags_t
+        TensorMapType, ///< [in] Data type of the tensor object.
+    uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3.
+    void *
+        GlobalAddress, ///< [in] Starting address of memory region described by tensor.
+    const uint64_t *
+        GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of
+                   ///< the TensorRank dimensions.
+    const uint64_t *
+        GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the
+                       ///< tensorRank - 1 dimensions.
+    const uint32_t *
+        BoxDim, ///< [in] Array containing traversal box size (number of elments) along
+                ///< each of the tensorRank dimensions. Specifies how many elements to be
+                ///< traversed along each tensor dimension.
+    const uint32_t *
+        ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank
+                        ///< dimensions.
+    ur_exp_tensor_map_interleave_flags_t
+        Interleave, ///< [in] Type of interleaved layout the tensor addresses
+    ur_exp_tensor_map_swizzle_flags_t
+        Swizzle, ///< [in] Bank swizzling pattern inside shared memory
+    ur_exp_tensor_map_l2_promotion_flags_t
+        L2Promotion, ///< [in] L2 promotion size.
+    ur_exp_tensor_map_oob_fill_flags_t
+        OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to
+                 ///< fill out-of-bound elements.
+    ur_exp_tensor_map_handle_t
+        *hTensorMap ///< [out] Handle of the tensor map object.
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    [[maybe_unused]] auto context = getContext();
+
+    // extract platform's function pointer table
+    auto dditable = reinterpret_cast<ur_device_object_t *>(hDevice)->dditable;
+    auto pfnEncodeTiledExp = dditable->ur.TensorMapExp.pfnEncodeTiledExp;
+    if (nullptr == pfnEncodeTiledExp) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    // convert loader handle to platform handle
+    hDevice = reinterpret_cast<ur_device_object_t *>(hDevice)->handle;
+
+    // forward to device-platform
+    result = pfnEncodeTiledExp(hDevice, TensorMapType, TensorRank,
+                               GlobalAddress, GlobalDim, GlobalStrides, BoxDim,
+                               ElementStrides, Interleave, Swizzle, L2Promotion,
+                               OobFill, hTensorMap);
+
+    if (UR_RESULT_SUCCESS != result) {
+        return result;
+    }
+
+    try {
+        // convert platform handle to loader handle
+        *hTensorMap = reinterpret_cast<ur_exp_tensor_map_handle_t>(
+            context->factories.ur_exp_tensor_map_factory.getInstance(
+                *hTensorMap, dditable));
+    } catch (std::bad_alloc &) {
+        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }
+
+    return result;
+}
+
 } // namespace ur_loader
 
 #if defined(__cplusplus)
@@ -10524,6 +10667,63 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's TensorMapExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+UR_DLLEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_tensor_map_exp_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (ur_loader::getContext()->version < version) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // Load the device-platform DDI tables
+    for (auto &platform : ur_loader::getContext()->platforms) {
+        if (platform.initStatus != UR_RESULT_SUCCESS) {
+            continue;
+        }
+        auto getTable = reinterpret_cast<ur_pfnGetTensorMapExpProcAddrTable_t>(
+            ur_loader::LibLoader::getFunctionPtr(
+                platform.handle.get(), "urGetTensorMapExpProcAddrTable"));
+        if (!getTable) {
+            continue;
+        }
+        platform.initStatus =
+            getTable(version, &platform.dditable.ur.TensorMapExp);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        if (ur_loader::getContext()->platforms.size() != 1 ||
+            ur_loader::getContext()->forceIntercept) {
+            // return pointers to loader's DDIs
+            pDdiTable->pfnEncodeIm2ColExp =
+                ur_loader::urTensorMapEncodeIm2ColExp;
+            pDdiTable->pfnEncodeTiledExp = ur_loader::urTensorMapEncodeTiledExp;
+        } else {
+            // return pointers directly to platform's DDIs
+            *pDdiTable = ur_loader::getContext()
+                             ->platforms.front()
+                             .dditable.ur.TensorMapExp;
+        }
+    }
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Exported function for filling application's USM table
 ///        with current process' addresses
diff --git a/source/loader/ur_ldrddi.hpp b/source/loader/ur_ldrddi.hpp
index f748500c73..309fb6cc65 100644
--- a/source/loader/ur_ldrddi.hpp
+++ b/source/loader/ur_ldrddi.hpp
@@ -87,6 +87,10 @@ using ur_exp_command_buffer_command_factory_t =
     singleton_factory_t<ur_exp_command_buffer_command_object_t,
                         ur_exp_command_buffer_command_handle_t>;
 
+using ur_exp_tensor_map_object_t = object_t<ur_exp_tensor_map_handle_t>;
+using ur_exp_tensor_map_factory_t =
+    singleton_factory_t<ur_exp_tensor_map_object_t,
+                        ur_exp_tensor_map_handle_t>;
+
 struct handle_factories {
     ur_adapter_factory_t ur_adapter_factory;
     ur_platform_factory_t ur_platform_factory;
@@ -105,6 +109,7 @@ struct handle_factories {
     ur_exp_command_buffer_factory_t ur_exp_command_buffer_factory;
     ur_exp_command_buffer_command_factory_t
         ur_exp_command_buffer_command_factory;
+    ur_exp_tensor_map_factory_t ur_exp_tensor_map_factory;
 };
 
 } // namespace ur_loader
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..3129eec2f7 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -9551,4 +9551,152 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Encode tensor map with image data
+///
+/// @details
+///     - Map encode using im2col.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hDevice`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType`
+///         + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave`
+///         + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle`
+///         + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion`
+///         + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == GlobalAddress`
+///         + `NULL == GlobalDim`
+///         + `NULL == GlobalStrides`
+///         + `NULL == PixelBoxLowerCorner`
+///         + `NULL == PixelBoxUpperCorner`
+///         + `NULL == ElementStrides`
+///         + `NULL == hTensorMap`
+ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp(
+    ur_device_handle_t hDevice, ///< [in] Handle of the device object.
+    ur_exp_tensor_map_data_type_flags_t
+        TensorMapType, ///< [in] Data type of the tensor object.
+    uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3.
+    void *
+        GlobalAddress, ///< [in] Starting address of memory region described by tensor.
+    const uint64_t *
+        GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of
+                   ///< the TensorRank dimensions.
+    const uint64_t *
+        GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the
+                       ///< tensorRank - 1 dimensions.
+    const int *
+        PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner.
+    const int *
+        PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner.
+ uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. + ) try { + auto pfnEncodeIm2ColExp = + ur_lib::getContext()->urDdiTable.TensorMapExp.pfnEncodeIm2ColExp; + if (nullptr == pfnEncodeIm2ColExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnEncodeIm2ColExp(hDevice, TensorMapType, TensorRank, GlobalAddress, + GlobalDim, GlobalStrides, PixelBoxLowerCorner, + PixelBoxUpperCorner, ChannelsPerPixel, + PixelsPerColumn, ElementStrides, Interleave, + Swizzle, L2Promotion, OobFill, hTensorMap); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Encode tensor map with tiled data +/// +/// @details +/// - Tiled map encode. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType` +/// + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave` +/// + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle` +/// + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion` +/// + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == GlobalAddress` +/// + `NULL == GlobalDim` +/// + `NULL == GlobalStrides` +/// + `NULL == BoxDim` +/// + `NULL == ElementStrides` +/// + `NULL == hTensorMap` +ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t * + BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. 
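+    // For orientation, a caller-side sketch of reaching this tiled variant
+    // through the library entry point; the sizes are illustrative only, and
+    // note GlobalStrides carries TensorRank - 1 entries:
+    //
+    //   uint64_t dims[3] = {64, 64, 64};
+    //   uint64_t strides[2] = {64 * sizeof(float), 64 * 64 * sizeof(float)};
+    //   uint32_t box[3] = {8, 8, 8};
+    //   uint32_t elem[3] = {1, 1, 1};
+    //   ur_exp_tensor_map_handle_t map;
+    //   urTensorMapEncodeTiledExp(
+    //       dev, UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32, /*TensorRank=*/3,
+    //       devPtr, dims, strides, box, elem,
+    //       UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE,
+    //       UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE,
+    //       UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE,
+    //       UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE, &map);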
+ ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. + ) try { + auto pfnEncodeTiledExp = + ur_lib::getContext()->urDdiTable.TensorMapExp.pfnEncodeTiledExp; + if (nullptr == pfnEncodeTiledExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnEncodeTiledExp(hDevice, TensorMapType, TensorRank, GlobalAddress, + GlobalDim, GlobalStrides, BoxDim, ElementStrides, + Interleave, Swizzle, L2Promotion, OobFill, + hTensorMap); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + } // extern "C" diff --git a/source/loader/ur_libddi.cpp b/source/loader/ur_libddi.cpp index 4d88bb2044..910cbfe607 100644 --- a/source/loader/ur_libddi.cpp +++ b/source/loader/ur_libddi.cpp @@ -99,6 +99,11 @@ __urdlllocal ur_result_t context_t::ddiInit() { &urDdiTable.Sampler); } + if (UR_RESULT_SUCCESS == result) { + result = urGetTensorMapExpProcAddrTable(UR_API_VERSION_CURRENT, + &urDdiTable.TensorMapExp); + } + if (UR_RESULT_SUCCESS == result) { result = urGetUSMProcAddrTable(UR_API_VERSION_CURRENT, &urDdiTable.USM); } diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 6b1cbfd5ee..690f562af4 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1109,6 +1109,49 @@ ur_result_t urPrintExpEnqueueNativeCommandProperties( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t +urPrintExpTensorMapDataTypeFlags(enum ur_exp_tensor_map_data_type_flag_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpTensorMapInterleaveFlags( + enum ur_exp_tensor_map_interleave_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpTensorMapL2PromotionFlags( + enum ur_exp_tensor_map_l2_promotion_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t +urPrintExpTensorMapSwizzleFlags(enum ur_exp_tensor_map_swizzle_flag_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t +urPrintExpTensorMapOobFillFlags(enum ur_exp_tensor_map_oob_fill_flag_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintAdapterGetParams(const struct ur_adapter_get_params_t *params, char *buffer, const size_t buff_size, @@ -2508,6 +2551,22 @@ ur_result_t urPrintSamplerCreateWithNativeHandleParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintTensorMapEncodeIm_2ColExpParams( + const struct ur_tensor_map_encode_im_2_col_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss 
<< params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintTensorMapEncodeTiledExpParams( + const struct ur_tensor_map_encode_tiled_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintUsmHostAllocParams(const struct ur_usm_host_alloc_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 853d61472e..5d1632ce18 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -8099,3 +8099,130 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_result_t result = UR_RESULT_SUCCESS; return result; } + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Encode tensor map with image data +/// +/// @details +/// - Map encode using im2col. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType` +/// + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave` +/// + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle` +/// + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion` +/// + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == GlobalAddress` +/// + `NULL == GlobalDim` +/// + `NULL == GlobalStrides` +/// + `NULL == PixelBoxLowerCorner` +/// + `NULL == PixelBoxUpperCorner` +/// + `NULL == ElementStrides` +/// + `NULL == hTensorMap` +ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const int * + PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. + const int * + PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. + uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. + uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. 
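+    // Note for readers: the ur_api.cpp bodies in this hunk are deliberately
+    // empty placeholders generated alongside the headers; the loader, layer,
+    // and adapter implementations patched above are what actually execute.
+    // The stub below just returns UR_RESULT_SUCCESS.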
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Encode tensor map with tiled data +/// +/// @details +/// - Tiled map encode. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_TENSOR_MAP_DATA_TYPE_FLAGS_MASK & TensorMapType` +/// + `::UR_EXP_TENSOR_MAP_INTERLEAVE_FLAGS_MASK & Interleave` +/// + `::UR_EXP_TENSOR_MAP_SWIZZLE_FLAGS_MASK & Swizzle` +/// + `::UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAGS_MASK & L2Promotion` +/// + `::UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == GlobalAddress` +/// + `NULL == GlobalDim` +/// + `NULL == GlobalStrides` +/// + `NULL == BoxDim` +/// + `NULL == ElementStrides` +/// + `NULL == hTensorMap` +ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t hDevice, ///< [in] Handle of the device object. + ur_exp_tensor_map_data_type_flags_t + TensorMapType, ///< [in] Data type of the tensor object. + uint32_t TensorRank, ///< [in] Dimensionality of tensor; must be at least 3. + void * + GlobalAddress, ///< [in] Starting address of memory region described by tensor. + const uint64_t * + GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of + ///< the TensorRank dimensions. + const uint64_t * + GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the + ///< tensorRank - 1 dimensions. + const uint32_t * + BoxDim, ///< [in] Array containing traversal box size (number of elments) along + ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< traversed along each tensor dimension. + const uint32_t * + ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ///< dimensions. + ur_exp_tensor_map_interleave_flags_t + Interleave, ///< [in] Type of interleaved layout the tensor addresses + ur_exp_tensor_map_swizzle_flags_t + Swizzle, ///< [in] Bank swizzling pattern inside shared memory + ur_exp_tensor_map_l2_promotion_flags_t + L2Promotion, ///< [in] L2 promotion size. + ur_exp_tensor_map_oob_fill_flags_t + OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to + ///< fill out-of-bound elements. + ur_exp_tensor_map_handle_t + *hTensorMap ///< [out] Handle of the tensor map object. 
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+    return result;
+}

From 69038e603e9589f89fc4596b6c3e5c6a9139eab8 Mon Sep 17 00:00:00 2001
From: Hugh Delaney
Date: Wed, 3 Jul 2024 11:07:18 +0100
Subject: [PATCH 046/148] Add CUDA impl

---
 source/adapters/cuda/CMakeLists.txt |   1 +
 source/adapters/cuda/tensor_map.cpp | 142 ++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 source/adapters/cuda/tensor_map.cpp

diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt
index b6b153a5d8..3d0418fd07 100644
--- a/source/adapters/cuda/CMakeLists.txt
+++ b/source/adapters/cuda/CMakeLists.txt
@@ -38,6 +38,7 @@ add_ur_adapter(${TARGET_NAME}
     ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
diff --git a/source/adapters/cuda/tensor_map.cpp b/source/adapters/cuda/tensor_map.cpp
new file mode 100644
index 0000000000..9d9559fd09
--- /dev/null
+++ b/source/adapters/cuda/tensor_map.cpp
@@ -0,0 +1,142 @@
+//===--------- tensor_map.cpp - CUDA Adapter ------------------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda.h>
+#include <ur_api.h>
+
+#include "context.hpp"
+
+struct ur_exp_tensor_map_handle_t_ {
+  CUtensorMap Map;
+};
+
+#define CONVERT(URTYPE, CUTYPE)                                                \
+  if (URTYPE & UrType)                                                         \
+    return CUTYPE;
+
+inline CUtensorMapDataType
+convertUrToCuDataType(ur_exp_tensor_map_data_type_flags_t UrType) {
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT8,
+          CU_TENSOR_MAP_DATA_TYPE_UINT8);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT16,
+          CU_TENSOR_MAP_DATA_TYPE_UINT16);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT32,
+          CU_TENSOR_MAP_DATA_TYPE_UINT32);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT32,
+          CU_TENSOR_MAP_DATA_TYPE_INT32);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_UINT64,
+          CU_TENSOR_MAP_DATA_TYPE_UINT64);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_INT64,
+          CU_TENSOR_MAP_DATA_TYPE_INT64);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT16,
+          CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32,
+          CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT64,
+          CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_BFLOAT16,
+          CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32_FTZ,
+          CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32,
+          CU_TENSOR_MAP_DATA_TYPE_TFLOAT32);
+  CONVERT(UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_TFLOAT32_FTZ,
+          CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ);
+  throw "convertUrToCuDataType failed!";
+}
+
+CUtensorMapInterleave
+convertUrToCuInterleave(ur_exp_tensor_map_interleave_flags_t UrType) {
+  CONVERT(UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE,
+          CU_TENSOR_MAP_INTERLEAVE_NONE);
+  CONVERT(UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_16B, CU_TENSOR_MAP_INTERLEAVE_16B);
+  CONVERT(UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_32B, CU_TENSOR_MAP_INTERLEAVE_32B);
+  throw "convertUrToCuInterleave failed!";
+}
+
+CUtensorMapSwizzle
+convertUrToCuSwizzle(ur_exp_tensor_map_swizzle_flags_t UrType) {
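+  // CONVERT expands to an early return keyed on the first matching bit in
+  // the enclosing function's UrType parameter, so these mappings behave like
+  // a flag-to-enum lookup table. A (hypothetical) direct use:
+  //
+  //   CUtensorMapSwizzle S =
+  //       convertUrToCuSwizzle(UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B);
+  //   // S == CU_TENSOR_MAP_SWIZZLE_32B
+  //
+  // A value with no recognized bit falls through to the throw at the end of
+  // each converter.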
+  CONVERT(UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE, CU_TENSOR_MAP_SWIZZLE_NONE);
+  CONVERT(UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_32B, CU_TENSOR_MAP_SWIZZLE_32B);
+  CONVERT(UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_64B, CU_TENSOR_MAP_SWIZZLE_64B);
+  CONVERT(UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_128B, CU_TENSOR_MAP_SWIZZLE_128B);
+  throw "convertUrToCuSwizzle failed!";
+}
+
+CUtensorMapL2promotion
+convertUrToL2promotion(ur_exp_tensor_map_l2_promotion_flags_t UrType) {
+  CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE,
+          CU_TENSOR_MAP_L2_PROMOTION_NONE);
+  CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B,
+          CU_TENSOR_MAP_L2_PROMOTION_L2_64B);
+  CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_128B,
+          CU_TENSOR_MAP_L2_PROMOTION_L2_128B);
+  CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_256B,
+          CU_TENSOR_MAP_L2_PROMOTION_L2_256B);
+  throw "convertUrToCul2promotion failed!";
+}
+
+CUtensorMapFloatOOBfill
+convertUrToCuOOBfill(ur_exp_tensor_map_oob_fill_flags_t UrType) {
+  CONVERT(UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE,
+          CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+  CONVERT(UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA,
+          CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA);
+  throw "convertUrToCuDataOOBfill failed!";
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp(
+    ur_device_handle_t hDevice,
+    ur_exp_tensor_map_data_type_flags_t TensorMapType, uint32_t TensorRank,
+    void *GlobalAddress, const uint64_t *GlobalDim,
+    const uint64_t *GlobalStrides, const int *PixelBoxLowerCorner,
+    const int *PixelBoxUpperCorner, uint32_t ChannelsPerPixel,
+    uint32_t PixelsPerColumn, const uint32_t *ElementStrides,
+    ur_exp_tensor_map_interleave_flags_t Interleave,
+    ur_exp_tensor_map_swizzle_flags_t Swizzle,
+    ur_exp_tensor_map_l2_promotion_flags_t L2Promotion,
+    ur_exp_tensor_map_oob_fill_flags_t OobFill,
+    ur_exp_tensor_map_handle_t *hTensorMap) {
+  ScopedContext Active(hDevice);
+  try {
+    UR_CHECK_ERROR(cuTensorMapEncodeIm2col(
+        &(*hTensorMap)->Map, convertUrToCuDataType(TensorMapType), TensorRank,
+        GlobalAddress, GlobalDim, GlobalStrides, PixelBoxLowerCorner,
+        PixelBoxUpperCorner, ChannelsPerPixel, PixelsPerColumn, ElementStrides,
+        convertUrToCuInterleave(Interleave), convertUrToCuSwizzle(Swizzle),
+        convertUrToL2promotion(L2Promotion), convertUrToCuOOBfill(OobFill)));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
+}
+UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp(
+    ur_device_handle_t hDevice,
+    ur_exp_tensor_map_data_type_flags_t TensorMapType, uint32_t TensorRank,
+    void *GlobalAddress, const uint64_t *GlobalDim,
+    const uint64_t *GlobalStrides, const uint32_t *BoxDim,
+    const uint32_t *ElementStrides,
+    ur_exp_tensor_map_interleave_flags_t Interleave,
+    ur_exp_tensor_map_swizzle_flags_t Swizzle,
+    ur_exp_tensor_map_l2_promotion_flags_t L2Promotion,
+    ur_exp_tensor_map_oob_fill_flags_t OobFill,
+    ur_exp_tensor_map_handle_t *hTensorMap) {
+  ScopedContext Active(hDevice);
+  try {
+    UR_CHECK_ERROR(cuTensorMapEncodeTiled(
+        &(*hTensorMap)->Map, convertUrToCuDataType(TensorMapType), TensorRank,
+        GlobalAddress, GlobalDim, GlobalStrides, BoxDim, ElementStrides,
+        convertUrToCuInterleave(Interleave), convertUrToCuSwizzle(Swizzle),
+        convertUrToL2promotion(L2Promotion), convertUrToCuOOBfill(OobFill)));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
+}

From 05492e66beea3e0f2218e6c2c63d2241e1404c1e Mon Sep 17 00:00:00 2001
From: Hugh Delaney
Date: Thu, 4 Jul 2024 15:27:34 +0100
Subject: [PATCH 047/148] Respond to comments

- Check that TensorRank < 3 is rejected, via the yaml
returns: . - Rename some things and remove copypasta --- include/ur_api.h | 24 ++++++++++------- scripts/core/exp-tensor-map.yml | 22 ++++++++++------ source/adapters/cuda/tensor_map.cpp | 12 ++++----- source/adapters/mock/ur_mockddi.cpp | 18 ++++++------- source/loader/layers/tracing/ur_trcddi.cpp | 18 ++++++------- source/loader/layers/validation/ur_valddi.cpp | 26 ++++++++++++------- source/loader/ur_ldrddi.cpp | 18 ++++++------- source/loader/ur_libapi.cpp | 22 +++++++++------- source/ur_api.cpp | 22 +++++++++------- 9 files changed, 104 insertions(+), 78 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 13334a9c8e..8d4e6e5972 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -10166,7 +10166,7 @@ urEnqueueNativeCommandExp( #if !defined(__GNUC__) #pragma endregion #endif -// Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs +// Intel 'oneAPI' Unified Runtime Experimental API for mapping tensor objects #if !defined(__GNUC__) #pragma region tensor map(experimental) #endif @@ -10287,6 +10287,8 @@ typedef enum ur_exp_tensor_map_oob_fill_flag_t { /// + `NULL == PixelBoxUpperCorner` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. @@ -10296,18 +10298,18 @@ urTensorMapEncodeIm2ColExp( const uint64_t *GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of ///< the TensorRank dimensions. const uint64_t *GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int *PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int *PixelBoxUpperCorner, ///< [in] Array containing DHW dimensions of upper box corner. uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. - const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses ur_exp_tensor_map_swizzle_flags_t Swizzle, ///< [in] Bank swizzling pattern inside shared memory ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. - ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ); @@ -10337,6 +10339,8 @@ urTensorMapEncodeIm2ColExp( /// + `NULL == BoxDim` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. @@ -10346,17 +10350,17 @@ urTensorMapEncodeTiledExp( const uint64_t *GlobalDim, ///< [in] Array containing tensor size (number of elements) along each of ///< the TensorRank dimensions. 
const uint64_t *GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t *BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. - const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + const uint32_t *ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses ur_exp_tensor_map_swizzle_flags_t Swizzle, ///< [in] Bank swizzling pattern inside shared memory ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. - ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + ur_exp_tensor_map_oob_fill_flags_t OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ); diff --git a/scripts/core/exp-tensor-map.yml b/scripts/core/exp-tensor-map.yml index 258a2403f0..fa1e8c1898 100644 --- a/scripts/core/exp-tensor-map.yml +++ b/scripts/core/exp-tensor-map.yml @@ -9,7 +9,7 @@ # --- #-------------------------------------------------------------------------- type: header -desc: "Intel $OneApi Unified Runtime Experimental API for enqueuing work through native APIs" +desc: "Intel $OneApi Unified Runtime Experimental API for mapping tensor objects" ordinal: "100" --- #-------------------------------------------------------------------------- @@ -125,7 +125,7 @@ params: desc: "[in] Array containing tensor size (number of elements) along each of the TensorRank dimensions." - type: const uint64_t* name: GlobalStrides - desc: "[in] Array containing stride size (in bytes) along each of the tensorRank - 1 dimensions." + desc: "[in] Array containing stride size (in bytes) along each of the TensorRank - 1 dimensions." - type: const int* name: PixelBoxLowerCorner desc: "[in] Array containing DHW dimensions of lower box corner." @@ -140,7 +140,7 @@ params: desc: "[in] Number of pixels per column." - type: const uint32_t* name: ElementStrides - desc: "[in] Array containing traversal stride in each of the tensorRank dimensions." + desc: "[in] Array containing traversal stride in each of the TensorRank dimensions." - type: $x_exp_tensor_map_interleave_flags_t name: Interleave desc: "[in] Type of interleaved layout the tensor addresses" @@ -152,10 +152,13 @@ params: desc: "[in] L2 promotion size." - type: $x_exp_tensor_map_oob_fill_flags_t name: OobFill - desc: "[in] Indicate whether zero or special NaN constant will be used to fill out-of-bound elements." + desc: "[in] Indicates whether zero or special NaN constant will be used to fill out-of-bounds elements." - type: $x_exp_tensor_map_handle_t* name: hTensorMap desc: "[out] Handle of the tensor map object." 
+returns: + - $X_RESULT_ERROR_INVALID_ARGUMENT: + - "`TensorRank < 3`" --- #-------------------------------------------------------------------------- type: function @@ -182,13 +185,13 @@ params: desc: "[in] Array containing tensor size (number of elements) along each of the TensorRank dimensions." - type: const uint64_t* name: GlobalStrides - desc: "[in] Array containing stride size (in bytes) along each of the tensorRank - 1 dimensions." + desc: "[in] Array containing stride size (in bytes) along each of the TensorRank - 1 dimensions." - type: const uint32_t* name: BoxDim - desc: "[in] Array containing traversal box size (number of elments) along each of the tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension." + desc: "[in] Array containing traversal box size (number of elments) along each of the TensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension." - type: const uint32_t* name: ElementStrides - desc: "[in] Array containing traversal stride in each of the tensorRank dimensions." + desc: "[in] Array containing traversal stride in each of the TensorRank dimensions." - type: $x_exp_tensor_map_interleave_flags_t name: Interleave desc: "[in] Type of interleaved layout the tensor addresses" @@ -200,8 +203,11 @@ params: desc: "[in] L2 promotion size." - type: $x_exp_tensor_map_oob_fill_flags_t name: OobFill - desc: "[in] Indicate whether zero or special NaN constant will be used to fill out-of-bound elements." + desc: "[in] Indicates whether zero or special NaN constant will be used to fill out-of-bounds elements." - type: $x_exp_tensor_map_handle_t* name: hTensorMap desc: "[out] Handle of the tensor map object." +returns: + - $X_RESULT_ERROR_INVALID_ARGUMENT: + - "`TensorRank < 3`" diff --git a/source/adapters/cuda/tensor_map.cpp b/source/adapters/cuda/tensor_map.cpp index 9d9559fd09..da8e4f8f8c 100644 --- a/source/adapters/cuda/tensor_map.cpp +++ b/source/adapters/cuda/tensor_map.cpp @@ -18,8 +18,8 @@ struct ur_exp_tensor_map_handle_t_ { }; #define CONVERT(URTYPE, CUTYPE) \ - if (URTYPE & UrType) \ - return CUTYPE; + if ((URTYPE)&UrType) \ + return (CUTYPE); inline CUtensorMapDataType convertUrToCuDataType(ur_exp_tensor_map_data_type_flags_t UrType) { @@ -71,7 +71,7 @@ convertUrToCuSwizzle(ur_exp_tensor_map_swizzle_flags_t UrType) { } CUtensorMapL2promotion -convertUrToL2promotion(ur_exp_tensor_map_l2_promotion_flags_t UrType) { +convertUrToCuL2Promotion(ur_exp_tensor_map_l2_promotion_flags_t UrType) { CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE, CU_TENSOR_MAP_L2_PROMOTION_NONE); CONVERT(UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_64B, @@ -84,7 +84,7 @@ convertUrToL2promotion(ur_exp_tensor_map_l2_promotion_flags_t UrType) { } CUtensorMapFloatOOBfill -convertUrToCuOOBfill(ur_exp_tensor_map_oob_fill_flags_t UrType) { +convertUrToCuOobFill(ur_exp_tensor_map_oob_fill_flags_t UrType) { CONVERT(UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); CONVERT(UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_REQUEST_ZERO_FMA, @@ -111,7 +111,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( GlobalAddress, GlobalDim, GlobalStrides, PixelBoxLowerCorner, PixelBoxUpperCorner, ChannelsPerPixel, PixelsPerColumn, ElementStrides, convertUrToCuInterleave(Interleave), convertUrToCuSwizzle(Swizzle), - convertUrToL2promotion(L2Promotion), convertUrToCuOOBfill(OobFill))); + convertUrToCuL2Promotion(L2Promotion), convertUrToCuOobFill(OobFill))); } catch (ur_result_t Err) { return Err; } @@ -134,7 +134,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( &(*hTensorMap)->Map, convertUrToCuDataType(TensorMapType), TensorRank, GlobalAddress, GlobalDim, GlobalStrides, BoxDim, ElementStrides, convertUrToCuInterleave(Interleave), convertUrToCuSwizzle(Swizzle), - convertUrToL2promotion(L2Promotion), convertUrToCuOOBfill(OobFill))); + convertUrToCuL2Promotion(L2Promotion), convertUrToCuOobFill(OobFill))); } catch (ur_result_t Err) { return Err; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index f2849e73ff..ec0be3890f 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10745,7 +10745,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -10753,7 +10753,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -10762,8 +10762,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) try { @@ -10835,13 +10835,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -10850,8 +10850,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. 
) try { diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 5b28fd9f30..b6be9b242f 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -9235,7 +9235,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -9243,7 +9243,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9252,8 +9252,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { @@ -9318,13 +9318,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9333,8 +9333,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index a46da9af2b..1701ee4725 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -10288,7 +10288,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. 
const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -10296,7 +10296,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -10305,8 +10305,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { @@ -10369,6 +10369,10 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( if (UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } + + if (TensorRank < 3) { + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } } if (getContext()->enableLifetimeValidation && @@ -10399,13 +10403,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -10414,8 +10418,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. 
) { @@ -10474,6 +10478,10 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( if (UR_EXP_TENSOR_MAP_OOB_FILL_FLAGS_MASK & OobFill) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } + + if (TensorRank < 3) { + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } } if (getContext()->enableLifetimeValidation && diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 598e92c311..2409738fbf 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -9378,7 +9378,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -9386,7 +9386,7 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9395,8 +9395,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { @@ -9451,13 +9451,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9466,8 +9466,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. 
) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 3129eec2f7..d83ec2e829 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -9578,6 +9578,8 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /// + `NULL == PixelBoxUpperCorner` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. ur_exp_tensor_map_data_type_flags_t @@ -9590,7 +9592,7 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -9598,7 +9600,7 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9607,8 +9609,8 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) try { @@ -9653,6 +9655,8 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( /// + `NULL == BoxDim` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. ur_exp_tensor_map_data_type_flags_t @@ -9665,13 +9669,13 @@ ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -9680,8 +9684,8 @@ ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. 
+ OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 5d1632ce18..7be7628651 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -8127,6 +8127,8 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( /// + `NULL == PixelBoxUpperCorner` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. ur_exp_tensor_map_data_type_flags_t @@ -8139,7 +8141,7 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const int * PixelBoxLowerCorner, ///< [in] Array containing DHW dimensions of lower box corner. const int * @@ -8147,7 +8149,7 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( uint32_t ChannelsPerPixel, ///< [in] Number of channels per pixel. uint32_t PixelsPerColumn, ///< [in] Number of pixels per column. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -8156,8 +8158,8 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { @@ -8191,6 +8193,8 @@ ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( /// + `NULL == BoxDim` /// + `NULL == ElementStrides` /// + `NULL == hTensorMap` +/// - ::UR_RESULT_ERROR_INVALID_ARGUMENT +/// + `TensorRank < 3` ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_device_handle_t hDevice, ///< [in] Handle of the device object. ur_exp_tensor_map_data_type_flags_t @@ -8203,13 +8207,13 @@ ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ///< the TensorRank dimensions. const uint64_t * GlobalStrides, ///< [in] Array containing stride size (in bytes) along each of the - ///< tensorRank - 1 dimensions. + ///< TensorRank - 1 dimensions. const uint32_t * BoxDim, ///< [in] Array containing traversal box size (number of elments) along - ///< each of the tensorRank dimensions. Specifies how many elements to be + ///< each of the TensorRank dimensions. Specifies how many elements to be ///< traversed along each tensor dimension. const uint32_t * - ElementStrides, ///< [in] Array containing traversal stride in each of the tensorRank + ElementStrides, ///< [in] Array containing traversal stride in each of the TensorRank ///< dimensions. 
ur_exp_tensor_map_interleave_flags_t Interleave, ///< [in] Type of interleaved layout the tensor addresses @@ -8218,8 +8222,8 @@ ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, ///< [in] L2 promotion size. ur_exp_tensor_map_oob_fill_flags_t - OobFill, ///< [in] Indicate whether zero or special NaN constant will be used to - ///< fill out-of-bound elements. + OobFill, ///< [in] Indicates whether zero or special NaN constant will be used to + ///< fill out-of-bounds elements. ur_exp_tensor_map_handle_t *hTensorMap ///< [out] Handle of the tensor map object. ) { From e0635975b3861fe3dd5a034e5d0089e53586f1a5 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 15 Jul 2024 15:10:14 +0100 Subject: [PATCH 048/148] Add unsupported entry points to other adapters --- source/adapters/hip/CMakeLists.txt | 1 + source/adapters/hip/tensor_map.cpp | 29 +++++++++++++++++++++++ source/adapters/level_zero/CMakeLists.txt | 1 + source/adapters/level_zero/tensor_map.cpp | 28 ++++++++++++++++++++++ source/adapters/native_cpu/CMakeLists.txt | 1 + source/adapters/native_cpu/tensor_map.cpp | 29 +++++++++++++++++++++++ source/adapters/opencl/CMakeLists.txt | 1 + source/adapters/opencl/tensor_map.cpp | 29 +++++++++++++++++++++++ 8 files changed, 119 insertions(+) create mode 100644 source/adapters/hip/tensor_map.cpp create mode 100644 source/adapters/level_zero/tensor_map.cpp create mode 100644 source/adapters/native_cpu/tensor_map.cpp create mode 100644 source/adapters/opencl/tensor_map.cpp diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 9113d7b1ca..36222907c6 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -86,6 +86,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp diff --git a/source/adapters/hip/tensor_map.cpp b/source/adapters/hip/tensor_map.cpp new file mode 100644 index 0000000000..59ab4932e5 --- /dev/null +++ b/source/adapters/hip/tensor_map.cpp @@ -0,0 +1,29 @@ +//===--------- tensor_map.cpp - HIP Adapter -------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, + uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 05a33c1224..4e81bbd738 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -45,6 +45,7 @@ if(UR_BUILD_ADAPTER_L0) ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp diff --git a/source/adapters/level_zero/tensor_map.cpp b/source/adapters/level_zero/tensor_map.cpp new file mode 100644 index 0000000000..60625cec94 --- /dev/null +++ b/source/adapters/level_zero/tensor_map.cpp @@ -0,0 +1,28 @@ +//===--------- tensor_map.cpp - L0 Adapter --------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, + uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 56cfc577d8..69f7fff6bd 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -34,6 +34,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp diff --git a/source/adapters/native_cpu/tensor_map.cpp b/source/adapters/native_cpu/tensor_map.cpp new file mode 100644 index 0000000000..288d748ab6 --- /dev/null +++ b/source/adapters/native_cpu/tensor_map.cpp @@ -0,0 +1,29 @@ +//===--------- tensor_map.cpp - Native CPU Adapter ------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, + uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + diff --git a/source/adapters/opencl/CMakeLists.txt b/source/adapters/opencl/CMakeLists.txt index a7e91f75e5..e091012bab 100644 --- a/source/adapters/opencl/CMakeLists.txt +++ b/source/adapters/opencl/CMakeLists.txt @@ -38,6 +38,7 @@ add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp diff --git a/source/adapters/opencl/tensor_map.cpp b/source/adapters/opencl/tensor_map.cpp new file mode 100644 index 0000000000..b39aaf800a --- /dev/null +++ b/source/adapters/opencl/tensor_map.cpp @@ -0,0 +1,29 @@ +//===--------- tensor_map.cpp - OpenCL Adapter ----------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, + uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, + ur_exp_tensor_map_l2_promotion_flags_t, ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + From 6e6059c20a307df0f14fd6b975dfd3e206c27923 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 15 Jul 2024 16:09:17 +0100 Subject: [PATCH 049/148] Clang format --- source/adapters/hip/tensor_map.cpp | 1 - source/adapters/native_cpu/tensor_map.cpp | 1 - source/adapters/opencl/tensor_map.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/source/adapters/hip/tensor_map.cpp b/source/adapters/hip/tensor_map.cpp index 59ab4932e5..348c4c9d05 100644 --- a/source/adapters/hip/tensor_map.cpp +++ b/source/adapters/hip/tensor_map.cpp @@ -26,4 +26,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - diff --git a/source/adapters/native_cpu/tensor_map.cpp b/source/adapters/native_cpu/tensor_map.cpp index 288d748ab6..eb9f01b318 100644 --- a/source/adapters/native_cpu/tensor_map.cpp +++ b/source/adapters/native_cpu/tensor_map.cpp @@ -26,4 +26,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - diff --git a/source/adapters/opencl/tensor_map.cpp b/source/adapters/opencl/tensor_map.cpp index b39aaf800a..ea2a009f88 100644 --- a/source/adapters/opencl/tensor_map.cpp +++ b/source/adapters/opencl/tensor_map.cpp @@ -26,4 +26,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - From ccde31ec616fa51b29e0d6f123bc9ad15bf9f0c6 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 25 Oct 2024 16:38:46 +0100 Subject: [PATCH 050/148] Put UR entry points in ur::level_zero Fixes missing symbol at linking for static build of L0 adapter. 
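[Usage sketch for review context only; not part of any patch in this series.]
Before the DDI wiring in the next patch, a minimal host-side sketch of how
the tensor-map entry points added by this series are expected to be called.
All variable values are made up, Device/DevicePtr are assumed to come from
the usual UR setup calls, and the UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32
and UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE enumerators are assumed to follow
the naming pattern of the other tensor-map flags; the call itself matches the
urTensorMapEncodeTiledExp signature declared in ur_api.h earlier in the
series. TensorRank is 3, the minimum the validation layer now accepts.

  // Sketch only. Assumes Device was obtained via urDeviceGet and DevicePtr
  // points at 64*64*64 floats allocated with urUSMDeviceAlloc.
  ur_exp_tensor_map_handle_t Map;
  uint64_t GlobalDim[3] = {64, 64, 64};
  // Strides (in bytes) for the TensorRank - 1 outer dimensions.
  uint64_t GlobalStrides[2] = {64 * sizeof(float), 64 * 64 * sizeof(float)};
  uint32_t BoxDim[3] = {8, 8, 8};         // elements traversed per dimension
  uint32_t ElementStrides[3] = {1, 1, 1}; // dense traversal
  ur_result_t Res = urTensorMapEncodeTiledExp(
      Device, UR_EXP_TENSOR_MAP_DATA_TYPE_FLAG_FLOAT32, /*TensorRank=*/3,
      DevicePtr, GlobalDim, GlobalStrides, BoxDim, ElementStrides,
      UR_EXP_TENSOR_MAP_INTERLEAVE_FLAG_NONE,
      UR_EXP_TENSOR_MAP_SWIZZLE_FLAG_NONE,
      UR_EXP_TENSOR_MAP_L2_PROMOTION_FLAG_NONE,
      UR_EXP_TENSOR_MAP_OOB_FILL_FLAG_NONE, &Map);
  // On the CUDA adapter this lowers to cuTensorMapEncodeTiled; the resulting
  // Map handle can then be passed to a kernel that uses the descriptor.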
--- source/adapters/level_zero/tensor_map.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/adapters/level_zero/tensor_map.cpp b/source/adapters/level_zero/tensor_map.cpp index 60625cec94..91d6498540 100644 --- a/source/adapters/level_zero/tensor_map.cpp +++ b/source/adapters/level_zero/tensor_map.cpp @@ -10,7 +10,9 @@ #include -UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( +namespace ur::level_zero { + +ur_result_t urTensorMapEncodeIm2ColExp( ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, @@ -18,7 +20,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + +ur_result_t urTensorMapEncodeTiledExp( ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, @@ -26,3 +29,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( ur_exp_tensor_map_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +} // namespace ur::level_zero From 837aa279cc3415f9f6ef3481e9c80bdec5078cd9 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 7 Nov 2024 17:40:34 +0000 Subject: [PATCH 051/148] Add ProcAddrTable Entry points --- source/adapters/cuda/ur_interface_loader.cpp | 13 +++++++++++++ source/adapters/hip/ur_interface_loader.cpp | 13 +++++++++++++ source/adapters/native_cpu/ur_interface_loader.cpp | 13 +++++++++++++ source/adapters/opencl/ur_interface_loader.cpp | 13 +++++++++++++ 4 files changed, 52 insertions(+) diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 4b13e6669c..cea4707a05 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -434,6 +434,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnEncodeIm2ColExp = urTensorMapEncodeIm2ColExp; + pDdiTable->pfnEncodeTiledExp = urTensorMapEncodeTiledExp; + + return result; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index f7ec09188f..2c9df55bb6 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -400,6 +400,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnEncodeIm2ColExp = urTensorMapEncodeIm2ColExp; + 
pDdiTable->pfnEncodeTiledExp = urTensorMapEncodeTiledExp; + + return result; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 9717f020c3..55b1e6a568 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -418,6 +418,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnEncodeIm2ColExp = urTensorMapEncodeIm2ColExp; + pDdiTable->pfnEncodeTiledExp = urTensorMapEncodeTiledExp; + + return result; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 46d2bf6cdd..d51c27f6cc 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -426,6 +426,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnEncodeIm2ColExp = urTensorMapEncodeIm2ColExp; + pDdiTable->pfnEncodeTiledExp = urTensorMapEncodeTiledExp; + + return result; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); From b1a32860fd8ad8ded8656d6b5956e5a35aa38b98 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 20 Nov 2024 14:47:20 +0000 Subject: [PATCH 052/148] Fix bad merge conflicts resolution --- include/ur_api.h | 4 ++-- include/ur_print.hpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 8d4e6e5972..bb1a1bed3f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -231,8 +231,8 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP = 244, ///< Enumerator for ::urCommandBufferUpdateWaitEventsExp UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 245, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT = 246, ///< Enumerator for ::urEnqueueEventsWaitWithBarrierExt - UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 230, ///< Enumerator for ::urTensorMapEncodeIm2ColExp - UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 231, ///< Enumerator for ::urTensorMapEncodeTiledExp + UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 247, ///< Enumerator for ::urTensorMapEncodeIm2ColExp + UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 248, ///< Enumerator for ::urTensorMapEncodeTiledExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 
dafe882726..1acde66f4f 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -987,6 +987,7 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { break; case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: os << "UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT"; + break; case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP: os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP"; break; From 2f5ff276a9f47e7d08995079b9f8fcf13469264c Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 3 Dec 2024 14:53:25 +0000 Subject: [PATCH 053/148] Add clarifications in extension documentation --- scripts/core/EXP-TENSOR-MAP.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/core/EXP-TENSOR-MAP.rst b/scripts/core/EXP-TENSOR-MAP.rst index 3679f3cfd1..15a6802363 100644 --- a/scripts/core/EXP-TENSOR-MAP.rst +++ b/scripts/core/EXP-TENSOR-MAP.rst @@ -23,8 +23,14 @@ Tensor Mapping APIs Motivation -------------------------------------------------------------------------------- -Used to target the CUDA entry points cuTensorMapEncodeIm2col and -cuTensorMapEncodeTiled. +Used to target the CUDA entry points ``cuTensorMapEncodeIm2col`` and +``cuTensorMapEncodeTiled``. + +For some tensor core operations on ``sm_90+`` Nvidia devices, a tensor +descriptor must be built on the host and passed to the kernel. The interfaces +mentioned above, and mapped to UR in this extension, provide the APIs necessary +to create these tensor descriptor objects, that can then be passed to the +kernels. API -------------------------------------------------------------------------------- @@ -61,7 +67,7 @@ Changelog Support -------------------------------------------------------------------------------- -This is only supported in the CUDA adapter. +This extension is only supported on the ``UR_PLATFORM_BACKEND_CUDA`` backend. Contributors -------------------------------------------------------------------------------- From dc3ca71f52887996bd51f97daa628b67f5ca9fac Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 20 Nov 2024 12:30:52 -0800 Subject: [PATCH 054/148] [L0] Add Support for External Semaphores - Added support for using the Intel L0 driver experimental extension for external sempahores. - This implementation enables support for external semaphores that will exist in a future L0 Intel GPU Driver implementation. - The functionality outlined in this commit mirrors the planned functionality that will be published in the next offical L0 spec release 1.12. Once the L0 1.12 spec is released, then the usage of the header definitions of this functionality will be updated to match the official spec while supporting the previous driver implementation. Signed-off-by: Neil R. 
Spruit --- source/adapters/level_zero/common.hpp | 73 +++++++++ source/adapters/level_zero/image.cpp | 188 ++++++++++++++++++++---- source/adapters/level_zero/platform.cpp | 40 +++++ source/adapters/level_zero/platform.hpp | 22 ++- 4 files changed, 296 insertions(+), 27 deletions(-) diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index 6dd8a614c5..8a93993752 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -531,3 +531,76 @@ extern thread_local int32_t ErrorAdapterNativeCode; int32_t AdapterErrorCode); #define L0_DRIVER_INORDER_MIN_VERSION 29534 + +// Definitions for the External Semaphore Extension + +#ifndef ZE_INTEL_EXTERNAL_SEMAPHORE_EXP_NAME +/// @brief Event sync mode extension name +#define ZE_INTEL_EXTERNAL_SEMAPHORE_EXP_NAME \ + "ZE_intel_experimental_external_semaphore" +#endif // ZE_INTEL_EXTERNAL_SEMAPHORE_EXP_NAME + +typedef enum _ze_intel_external_semaphore_exp_version_t { + ZE_EXTERNAL_SEMAPHORE_EXP_VERSION_1_0 = + ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_EXTERNAL_SEMAPHORE_EXP_VERSION_CURRENT = + ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_EXTERNAL_SEMAPHORE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_external_semaphore_exp_version_t; +typedef enum _ze_intel_external_semaphore_exp_flags_t { + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_FD, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_WIN32, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_WIN32_KMT, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_D3D12_FENCE, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_D3D11_FENCE, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_KEYED_MUTEX, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_KEYED_MUTEX_KMT, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_TIMELINE_SEMAPHORE_FD, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_TIMELINE_SEMAPHORE_WIN32 +} ze_intel_external_semaphore_exp_flags_t; + +typedef struct _ze_intel_external_semaphore_exp_desc_t { + ze_structure_type_t stype; + const void *pNext; + ze_intel_external_semaphore_exp_flags_t flags; +} ze_intel_external_semaphore_exp_desc_t; + +typedef struct _ze_intel_external_semaphore_win32_exp_desc_t { + ze_structure_type_t stype; + const void *pNext; + void *handle; + const char *name; +} ze_intel_external_semaphore_win32_exp_desc_t; + +typedef struct _ze_intel_external_semaphore_fd_exp_desc_t { + ze_structure_type_t stype; + const void *pNext; + int fd; +} ze_intel_external_semaphore_desc_fd_exp_desc_t; + +typedef struct _ze_intel_external_semaphore_signal_exp_params_t { + ze_structure_type_t stype; + const void *pNext; + uint64_t value; +} ze_intel_external_semaphore_signal_exp_params_t; + +typedef struct _ze_intel_external_semaphore_wait_exp_params_t { + ze_structure_type_t stype; + const void *pNext; + + uint64_t value; +} ze_intel_external_semaphore_wait_exp_params_t; + +typedef struct _ze_intel_external_semaphore_exp_handle_t + *ze_intel_external_semaphore_exp_handle_t; + +#define ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_EXP_DESC \ + (ze_structure_type_t)0x0003001E +#define ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WIN32_EXP_DESC \ + (ze_structure_type_t)0x0003001F +#define ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_FD_EXP_DESC \ + (ze_structure_type_t)0x00030023 +#define ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_EXP \ + (ze_structure_type_t)0x00030024 +#define ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WAIT_PARAMS_EXP \ + (ze_structure_type_t)0x00030025 diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 6433b1f325..8437fcff95 100644 --- 
a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -1190,41 +1190,130 @@ ur_result_t urBindlessImagesImportExternalSemaphoreExp( ur_exp_external_semaphore_type_t semHandleType, ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, ur_exp_external_semaphore_handle_t *phExternalSemaphoreHandle) { - std::ignore = hContext; - std::ignore = hDevice; - std::ignore = semHandleType; - std::ignore = pExternalSemaphoreDesc; - std::ignore = phExternalSemaphoreHandle; - logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + auto UrPlatform = hContext->getPlatform(); + if (UrPlatform->ZeExternalSemaphoreExt.Supported == false) { + logger::error(logger::LegacyMessage("[UR][L0] "), + " {} function not supported!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + ze_intel_external_semaphore_exp_desc_t SemDesc = { + ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_EXP_DESC, nullptr, + ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_FD}; + ze_intel_external_semaphore_exp_handle_t ExtSemaphoreHandle; + ze_intel_external_semaphore_desc_fd_exp_desc_t FDExpDesc = { + ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_FD_EXP_DESC, nullptr, 0}; + _ze_intel_external_semaphore_win32_exp_desc_t Win32ExpDesc = { + ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WIN32_EXP_DESC, nullptr, + nullptr, nullptr}; + void *pNext = const_cast(pExternalSemaphoreDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = static_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { + auto FileDescriptor = + static_cast(pNext); + FDExpDesc.fd = FileDescriptor->fd; + SemDesc.pNext = &FDExpDesc; + SemDesc.flags = ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_FD; + } else if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE) { + SemDesc.pNext = &Win32ExpDesc; + auto Win32Handle = static_cast(pNext); + switch (semHandleType) { + case UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT: + SemDesc.flags = ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_WIN32; + break; + case UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT_DX12_FENCE: + SemDesc.flags = ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_D3D12_FENCE; + break; + case UR_EXP_EXTERNAL_SEMAPHORE_TYPE_OPAQUE_FD: + SemDesc.flags = ZE_EXTERNAL_SEMAPHORE_EXP_FLAGS_OPAQUE_FD; + break; + default: + return UR_RESULT_ERROR_INVALID_VALUE; + } + Win32ExpDesc.handle = Win32Handle->handle; + } + pNext = const_cast(BaseDesc->pNext); + } + + ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt.zexImportExternalSemaphoreExp, + (hDevice->ZeDevice, &ExtSemaphoreHandle, &SemDesc)); + *phExternalSemaphoreHandle = + (ur_exp_external_semaphore_handle_t)ExtSemaphoreHandle; + + return UR_RESULT_SUCCESS; } ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_semaphore_handle_t hExternalSemaphore) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = hExternalSemaphore; - logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto UrPlatform = hContext->getPlatform(); + if (UrPlatform->ZeExternalSemaphoreExt.Supported == false) { + logger::error(logger::LegacyMessage("[UR][L0] "), + " {} function not supported!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + ZE2UR_CALL( + 
UrPlatform->ZeExternalSemaphoreExt.zexDeviceReleaseExternalSemaphoreExp, + ((ze_intel_external_semaphore_exp_handle_t)hExternalSemaphore)); + + return UR_RESULT_SUCCESS; } ur_result_t urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, bool hasValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hQueue; - std::ignore = hSemaphore; - std::ignore = hasValue; - std::ignore = waitValue; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - logger::error(logger::LegacyMessage("[UR][L0] "), - " {} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto UrPlatform = hQueue->Context->getPlatform(); + if (UrPlatform->ZeExternalSemaphoreExt.Supported == false) { + logger::error(logger::LegacyMessage("[UR][L0] "), + " {} function not supported!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + bool UseCopyEngine = false; + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + numEventsInWaitList, phEventWaitList, hQueue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(hQueue->Context->getAvailableCommandList( + hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList, + OkToBatch, nullptr /*ForcedCmdQueue*/)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = phEvent == nullptr; + ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(hQueue, Event, + UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP, + CommandList, IsInternal, + /*IsMultiDevice*/ false)); + UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event, + numEventsInWaitList, phEventWaitList, + CommandList->second.ZeQueue)); + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + ze_intel_external_semaphore_wait_exp_params_t WaitParams = { + ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WAIT_PARAMS_EXP, nullptr, 0}; + WaitParams.value = hasValue ? 
waitValue : 0;
+  const ze_intel_external_semaphore_exp_handle_t hExtSemaphore =
+      reinterpret_cast<ze_intel_external_semaphore_exp_handle_t>(hSemaphore);
+  ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt
+                 .zexCommandListAppendWaitExternalSemaphoresExp,
+             (ZeCommandList, &hExtSemaphore, &WaitParams, 1, ZeEvent,
+              WaitList.Length, WaitList.ZeEventList));
+
+  return UR_RESULT_SUCCESS;
 }

 ur_result_t urBindlessImagesSignalExternalSemaphoreExp(
@@ -1238,9 +1327,56 @@ ur_result_t urBindlessImagesSignalExternalSemaphoreExp(
   std::ignore = numEventsInWaitList;
   std::ignore = phEventWaitList;
   std::ignore = phEvent;
-  logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"),
-                "{} function not implemented!", __FUNCTION__);
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  auto UrPlatform = hQueue->Context->getPlatform();
+  if (UrPlatform->ZeExternalSemaphoreExt.Supported == false) {
+    logger::error(logger::LegacyMessage("[UR][L0] "),
+                  " {} function not supported!", __FUNCTION__);
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  }
+
+  bool UseCopyEngine = false;
+
+  // We want to batch these commands to avoid extra submissions (costly)
+  bool OkToBatch = true;
+
+  _ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      numEventsInWaitList, phEventWaitList, hQueue, UseCopyEngine));
+
+  // Get a new command list to be used on this call
+  ur_command_list_ptr_t CommandList{};
+  UR_CALL(hQueue->Context->getAvailableCommandList(
+      hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList,
+      OkToBatch, nullptr /*ForcedCmdQueue*/));
+
+  ze_event_handle_t ZeEvent = nullptr;
+  ur_event_handle_t InternalEvent;
+  bool IsInternal = phEvent == nullptr;
+  ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent;
+  UR_CALL(createEventAndAssociateQueue(hQueue, Event,
+                                       UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP,
+                                       CommandList, IsInternal,
+                                       /*IsMultiDevice*/ false));
+  UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
+                         numEventsInWaitList, phEventWaitList,
+                         CommandList->second.ZeQueue));
+  (*Event)->WaitList = TmpWaitList;
+
+  const auto &ZeCommandList = CommandList->first;
+  const auto &WaitList = (*Event)->WaitList;
+
+  ze_intel_external_semaphore_signal_exp_params_t SignalParams = {
+      ZE_INTEL_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_EXP, nullptr, 0};
+  SignalParams.value = hasValue ?
signalValue : 0;
+  const ze_intel_external_semaphore_exp_handle_t hExtSemaphore =
+      reinterpret_cast<ze_intel_external_semaphore_exp_handle_t>(hSemaphore);
+
+  ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt
+                 .zexCommandListAppendSignalExternalSemaphoresExp,
+             (ZeCommandList, &hExtSemaphore, &SignalParams, 1, ZeEvent,
+              WaitList.Length, WaitList.ZeEventList));
+
+  return UR_RESULT_SUCCESS;
 }
 } // namespace ur::level_zero
diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp
index 0237b62863..2bfc9302db 100644
--- a/source/adapters/level_zero/platform.cpp
+++ b/source/adapters/level_zero/platform.cpp
@@ -221,6 +221,7 @@ ur_result_t ur_platform_handle_t_::initialize() {
       (ZeDriver, &Count, ZeExtensions.data()));

   bool MutableCommandListSpecExtensionSupported = false;
+  bool ZeIntelExternalSemaphoreExtensionSupported = false;
   for (auto &extension : ZeExtensions) {
     // Check if global offset extension is available
     if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME,
@@ -252,6 +253,13 @@ ur_result_t ur_platform_handle_t_::initialize() {
         MutableCommandListSpecExtensionSupported = true;
       }
     }
+    // Check if extension is available for External Semaphores
+    if (strncmp(extension.name, ZE_INTEL_EXTERNAL_SEMAPHORE_EXP_NAME,
+                strlen(ZE_INTEL_EXTERNAL_SEMAPHORE_EXP_NAME) + 1) == 0) {
+      if (extension.version == ZE_EXTERNAL_SEMAPHORE_EXP_VERSION_1_0) {
+        ZeIntelExternalSemaphoreExtensionSupported = true;
+      }
+    }
     zeDriverExtensionMap[extension.name] = extension.version;
   }
@@ -286,6 +294,38 @@ ur_result_t ur_platform_handle_t_::initialize() {
   // If yes, then set up L0 API pointers if the platform supports it.
   ZeUSMImport.setZeUSMImport(this);

+  if (ZeIntelExternalSemaphoreExtensionSupported) {
+    ZeExternalSemaphoreExt.Supported |=
+        (ZE_CALL_NOCHECK(
+             zeDriverGetExtensionFunctionAddress,
+             (ZeDriver, "zeIntelDeviceImportExternalSemaphoreExp",
+              reinterpret_cast<void **>(
+                  &ZeExternalSemaphoreExt.zexImportExternalSemaphoreExp))) ==
+         0);
+    ZeExternalSemaphoreExt.Supported |=
+        (ZE_CALL_NOCHECK(
+             zeDriverGetExtensionFunctionAddress,
+             (ZeDriver, "zeIntelCommandListAppendWaitExternalSemaphoresExp",
+              reinterpret_cast<void **>(
+                  &ZeExternalSemaphoreExt
+                       .zexCommandListAppendWaitExternalSemaphoresExp))) == 0);
+    ZeExternalSemaphoreExt.Supported |=
+        (ZE_CALL_NOCHECK(
+             zeDriverGetExtensionFunctionAddress,
+             (ZeDriver, "zeIntelCommandListAppendSignalExternalSemaphoresExp",
+              reinterpret_cast<void **>(
+                  &ZeExternalSemaphoreExt
+                       .zexCommandListAppendSignalExternalSemaphoresExp))) ==
+         0);
+    ZeExternalSemaphoreExt.Supported |=
+        (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
+                         (ZeDriver, "zeIntelDeviceReleaseExternalSemaphoreExp",
+                          reinterpret_cast<void **>(
+                              &ZeExternalSemaphoreExt
+                                   .zexDeviceReleaseExternalSemaphoreExp))) ==
+         0);
+  }
+
   // Check if mutable command list extension is supported and initialize
   // function pointers.
   if (MutableCommandListSpecExtensionSupported) {
diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp
index 468c602b10..4b613fb1e5 100644
--- a/source/adapters/level_zero/platform.hpp
+++ b/source/adapters/level_zero/platform.hpp
@@ -114,4 +114,24 @@ struct ur_platform_handle_t_ : public _ur_platform {
         ze_command_list_handle_t, const ze_mutable_command_id_exp_desc_t *,
         uint32_t, ze_kernel_handle_t *, uint64_t *) = nullptr;
   } ZeMutableCmdListExt;
-};
+
+  // Structure with function pointers for External Semaphore Extension.
+ struct ZeExternalSemaphoreExtension { + bool Supported = false; + ze_result_t (*zexImportExternalSemaphoreExp)( + ze_device_handle_t, ze_intel_external_semaphore_exp_handle_t *, + const ze_intel_external_semaphore_exp_desc_t *); + ze_result_t (*zexCommandListAppendWaitExternalSemaphoresExp)( + ze_command_list_handle_t, + const ze_intel_external_semaphore_exp_handle_t *, + const ze_intel_external_semaphore_wait_exp_params_t *, unsigned int, + ze_event_handle_t, uint32_t, ze_event_handle_t *); + ze_result_t (*zexCommandListAppendSignalExternalSemaphoresExp)( + ze_command_list_handle_t, + const ze_intel_external_semaphore_exp_handle_t *, + const ze_intel_external_semaphore_signal_exp_params_t *, size_t, + ze_event_handle_t, uint32_t, ze_event_handle_t *); + ze_result_t (*zexDeviceReleaseExternalSemaphoreExp)( + ze_intel_external_semaphore_exp_handle_t); + } ZeExternalSemaphoreExt; +}; \ No newline at end of file From 8c4366f13af4ee4b0a37169132cd79a91a3775cd Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 3 Dec 2024 15:31:43 +0000 Subject: [PATCH 055/148] Fix formatting --- include/ur_api.h | 6 ++-- include/ur_api_funcs.def | 2 ++ include/ur_print.hpp | 12 ++++---- scripts/core/registry.yml | 12 ++++---- .../level_zero/ur_interface_loader.cpp | 17 +++++++++++ .../level_zero/ur_interface_loader.hpp | 24 ++++++++++++++++ source/loader/layers/tracing/ur_trcddi.cpp | 28 ++++++++++++------- source/loader/ur_ldrddi.cpp | 5 ++++ 8 files changed, 81 insertions(+), 25 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index bb1a1bed3f..68c5032460 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -215,7 +215,9 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED = 229, ///< Enumerator for ::urLoaderConfigSetMockingEnabled UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP = 230, ///< Enumerator for ::urBindlessImagesReleaseExternalMemoryExp + UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 230, ///< Enumerator for ::urTensorMapEncodeIm2ColExp UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP = 231, ///< Enumerator for ::urCommandBufferAppendUSMMemcpyExp + UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 231, ///< Enumerator for ::urTensorMapEncodeTiledExp UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP = 232, ///< Enumerator for ::urCommandBufferAppendUSMFillExp UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP = 233, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyExp UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP = 234, ///< Enumerator for ::urCommandBufferAppendMemBufferWriteExp @@ -231,8 +233,6 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP = 244, ///< Enumerator for ::urCommandBufferUpdateWaitEventsExp UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 245, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT = 246, ///< Enumerator for ::urEnqueueEventsWaitWithBarrierExt - UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 247, ///< Enumerator for ::urTensorMapEncodeIm2ColExp - UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 248, ///< Enumerator for ::urTensorMapEncodeTiledExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -10168,7 +10168,7 @@ urEnqueueNativeCommandExp( #endif // Intel 'oneAPI' Unified Runtime Experimental API for mapping tensor objects #if !defined(__GNUC__) -#pragma region tensor map(experimental) +#pragma region 
tensor_map_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Handle of tensor map object diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def index 4920245369..5279534547 100644 --- a/include/ur_api_funcs.def +++ b/include/ur_api_funcs.def @@ -185,6 +185,8 @@ _UR_API(urCommandBufferUpdateSignalEventExp) _UR_API(urCommandBufferUpdateWaitEventsExp) _UR_API(urCommandBufferGetInfoExp) _UR_API(urCommandBufferCommandGetInfoExp) +_UR_API(urTensorMapEncodeIm2ColExp) +_UR_API(urTensorMapEncodeTiledExp) _UR_API(urUsmP2PEnablePeerAccessExp) _UR_API(urUsmP2PDisablePeerAccessExp) _UR_API(urUsmP2PPeerAccessGetInfoExp) diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 1acde66f4f..cd6bc2ffe0 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -940,9 +940,15 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP"; break; + case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP: + os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP"; + break; case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP"; break; + case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP: + os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP"; + break; case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP"; break; @@ -988,12 +994,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: os << "UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT"; break; - case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP: - os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP"; - break; - case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP: - os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP"; - break; default: os << "unknown enumerator"; break; diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 059e23c2a0..6d7eaef77c 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -559,9 +559,15 @@ etors: - name: BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP desc: Enumerator for $xBindlessImagesReleaseExternalMemoryExp value: '230' +- name: TENSOR_MAP_ENCODE_IM_2_COL_EXP + desc: Enumerator for $xTensorMapEncodeIm2ColExp + value: '230' - name: COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP desc: Enumerator for $xCommandBufferAppendUSMMemcpyExp value: '231' +- name: TENSOR_MAP_ENCODE_TILED_EXP + desc: Enumerator for $xTensorMapEncodeTiledExp + value: '231' - name: COMMAND_BUFFER_APPEND_USM_FILL_EXP desc: Enumerator for $xCommandBufferAppendUSMFillExp value: '232' @@ -607,12 +613,6 @@ etors: - name: ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT desc: Enumerator for $xEnqueueEventsWaitWithBarrierExt value: '246' -- name: TENSOR_MAP_ENCODE_IM_2_COL_EXP - desc: Enumerator for $xTensorMapEncodeIm2ColExp - value: '230' -- name: TENSOR_MAP_ENCODE_TILED_EXP - desc: Enumerator for $xTensorMapEncodeTiledExp - value: '231' --- type: enum desc: Defines structure types diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 0a36b3ecad..1d9c8d5c37 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -423,6 +423,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( return result; } +UR_APIEXPORT ur_result_t 
UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnEncodeIm2ColExp = ur::level_zero::urTensorMapEncodeIm2ColExp; + pDdiTable->pfnEncodeTiledExp = ur::level_zero::urTensorMapEncodeTiledExp; + + return result; +} + UR_APIEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); @@ -594,6 +607,10 @@ ur_result_t urAdapterGetDdiTables(ur_dditable_t *ddi) { &ddi->Sampler); if (result != UR_RESULT_SUCCESS) return result; + result = ur::level_zero::urGetTensorMapExpProcAddrTable( + UR_API_VERSION_CURRENT, &ddi->TensorMapExp); + if (result != UR_RESULT_SUCCESS) + return result; result = ur::level_zero::urGetUSMProcAddrTable(UR_API_VERSION_CURRENT, &ddi->USM); if (result != UR_RESULT_SUCCESS) diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 1215d6449e..bebba18e6d 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -735,6 +735,30 @@ ur_result_t urEnqueueNativeCommandExp( const ur_exp_enqueue_native_command_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, + ur_exp_tensor_map_data_type_flags_t TensorMapType, uint32_t TensorRank, + void *GlobalAddress, const uint64_t *GlobalDim, + const uint64_t *GlobalStrides, const int *PixelBoxLowerCorner, + const int *PixelBoxUpperCorner, uint32_t ChannelsPerPixel, + uint32_t PixelsPerColumn, const uint32_t *ElementStrides, + ur_exp_tensor_map_interleave_flags_t Interleave, + ur_exp_tensor_map_swizzle_flags_t Swizzle, + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, + ur_exp_tensor_map_oob_fill_flags_t OobFill, + ur_exp_tensor_map_handle_t *hTensorMap); +ur_result_t +urTensorMapEncodeTiledExp(ur_device_handle_t hDevice, + ur_exp_tensor_map_data_type_flags_t TensorMapType, + uint32_t TensorRank, void *GlobalAddress, + const uint64_t *GlobalDim, + const uint64_t *GlobalStrides, const uint32_t *BoxDim, + const uint32_t *ElementStrides, + ur_exp_tensor_map_interleave_flags_t Interleave, + ur_exp_tensor_map_swizzle_flags_t Swizzle, + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, + ur_exp_tensor_map_oob_fill_flags_t OobFill, + ur_exp_tensor_map_handle_t *hTensorMap); #ifdef UR_STATIC_ADAPTER_LEVEL_ZERO ur_result_t urAdapterGetDdiTables(ur_dditable_t *ddi); #endif diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index b6be9b242f..5c3d67dbdc 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -9284,7 +9284,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, "urTensorMapEncodeIm2ColExp", ¶ms); - getContext()->logger.info("---> urTensorMapEncodeIm2ColExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urTensorMapEncodeIm2ColExp\n"); ur_result_t result = pfnEncodeIm2ColExp( hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, @@ -9296,10 +9297,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( "urTensorMapEncodeIm2ColExp", ¶ms, &result, 
instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, ¶ms); + logger.info(" <--- urTensorMapEncodeIm2ColExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -9354,7 +9358,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, "urTensorMapEncodeTiledExp", ¶ms); - getContext()->logger.info("---> urTensorMapEncodeTiledExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urTensorMapEncodeTiledExp\n"); ur_result_t result = pfnEncodeTiledExp( hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, @@ -9365,10 +9370,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( "urTensorMapEncodeTiledExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, ¶ms); + logger.info(" <--- urTensorMapEncodeTiledExp({}) -> {};\n", + args_str.str(), result); + } return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 2409738fbf..d152e63dc8 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -10693,6 +10693,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } From 32cc0d9fb11427f402ff87d971f929e5e49f8f93 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 3 Dec 2024 15:44:55 +0000 Subject: [PATCH 056/148] Fix enum ordering for tensor map --- include/ur_api.h | 4 ++-- include/ur_print.hpp | 12 ++++++------ scripts/core/registry.yml | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 68c5032460..2f3d535610 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -215,9 +215,7 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED = 229, ///< Enumerator for ::urLoaderConfigSetMockingEnabled UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP = 230, ///< Enumerator for ::urBindlessImagesReleaseExternalMemoryExp - UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 230, ///< Enumerator for ::urTensorMapEncodeIm2ColExp UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP = 231, ///< Enumerator for ::urCommandBufferAppendUSMMemcpyExp - UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 231, ///< Enumerator for ::urTensorMapEncodeTiledExp UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP = 232, ///< Enumerator for ::urCommandBufferAppendUSMFillExp UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP = 233, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyExp UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP = 234, 
///< Enumerator for ::urCommandBufferAppendMemBufferWriteExp
@@ -233,6 +231,8 @@ typedef enum ur_function_t {
     UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP = 244,          ///< Enumerator for ::urCommandBufferUpdateWaitEventsExp
     UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 245, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp
     UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT = 246,           ///< Enumerator for ::urEnqueueEventsWaitWithBarrierExt
+    UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 247,                 ///< Enumerator for ::urTensorMapEncodeIm2ColExp
+    UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 248,                    ///< Enumerator for ::urTensorMapEncodeTiledExp
     /// @cond
     UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
     /// @endcond
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index cd6bc2ffe0..1acde66f4f 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -940,15 +940,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
     case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP:
         os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP";
         break;
-    case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP:
-        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP";
-        break;
     case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP:
         os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP";
         break;
-    case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP:
-        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP";
-        break;
     case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP:
         os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP";
         break;
@@ -988,12 +994,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
     case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT:
         os << "UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT";
         break;
+    case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP:
+        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP";
+        break;
+    case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP:
+        os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP";
+        break;
     default:
         os << "unknown enumerator";
         break;
diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml
index 6d7eaef77c..f1a5d9199f 100644
--- a/scripts/core/registry.yml
+++ b/scripts/core/registry.yml
@@ -559,15 +559,9 @@ etors:
 - name: BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP
   desc: Enumerator for $xBindlessImagesReleaseExternalMemoryExp
   value: '230'
-- name: TENSOR_MAP_ENCODE_IM_2_COL_EXP
-  desc: Enumerator for $xTensorMapEncodeIm2ColExp
-  value: '230'
 - name: COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP
   desc: Enumerator for $xCommandBufferAppendUSMMemcpyExp
   value: '231'
-- name: TENSOR_MAP_ENCODE_TILED_EXP
-  desc: Enumerator for $xTensorMapEncodeTiledExp
-  value: '231'
 - name: COMMAND_BUFFER_APPEND_USM_FILL_EXP
   desc: Enumerator for $xCommandBufferAppendUSMFillExp
   value: '232'
@@ -613,6 +607,12 @@ etors:
 - name: ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT
   desc: Enumerator for $xEnqueueEventsWaitWithBarrierExt
   value: '246'
+- name: TENSOR_MAP_ENCODE_IM_2_COL_EXP
+  desc: Enumerator for $xTensorMapEncodeIm2ColExp
+  value: '247'
+- name: TENSOR_MAP_ENCODE_TILED_EXP
+  desc: Enumerator for $xTensorMapEncodeTiledExp
+  value: '248'
 ---
 type: enum
 desc: Defines structure types

From 41ad797c399368a407ad236e75971244a00b6acc Mon Sep 17 00:00:00 2001
From: Victor Lomuller
Date: Wed, 13 Nov 2024 15:50:28 +0000
Subject: [PATCH 057/148] Add new launch property to support
 work_group_scratch_memory

intel/llvm#15061 introduces a new property work_group_scratch_memory which
allows the user to set a given amount of local memory to be used.
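As a quick illustration, a hypothetical host-side sketch of the resulting
UR-level usage (Queue and Kernel are assumed to be valid handles; the
property id, union member, and entry point are the ones added by this
patch):

    // Hypothetical sketch: request 1 KiB of implicit work-group memory.
    ur_exp_launch_property_t Prop{};
    Prop.id = UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
    Prop.value.workgroup_mem_size = 1024;

    const size_t GlobalOffset[1] = {0};
    const size_t GlobalSize[1] = {1024};
    urEnqueueKernelLaunchCustomExp(Queue, Kernel, /*workDim=*/1, GlobalOffset,
                                   GlobalSize, /*pLocalWorkSize=*/nullptr,
                                   /*numPropsInLaunchPropList=*/1, &Prop,
                                   /*numEventsInWaitList=*/0, nullptr,
                                   /*phEvent=*/nullptr);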
In order to pass this information to the adapter, the patch adds a new
launch property to urEnqueueKernelLaunchCustomExp. The patch also changes
the signature of urEnqueueKernelLaunchCustomExp to add a global offset, in
order to preserve existing functionality when using this extension.

Signed-off-by: Victor Lomuller
---
 include/ur_api.h                              |  15 ++-
 include/ur_ddi.h                              |   1 +
 include/ur_print.hpp                          |  16 +++
 scripts/core/exp-launch-properties.yml        |  13 +-
 source/adapters/cuda/enqueue.cpp              | 111 +++++++++++-------
 source/adapters/cuda/kernel.hpp               |  53 ++++++---
 source/adapters/level_zero/queue.cpp          |   5 +-
 .../level_zero/ur_interface_loader.hpp        |   4 +-
 source/adapters/level_zero/v2/queue_api.cpp   |   6 +-
 source/adapters/level_zero/v2/queue_api.hpp   |   6 +-
 .../v2/queue_immediate_in_order.cpp           |   4 +-
 .../v2/queue_immediate_in_order.hpp           |   4 +-
 source/adapters/mock/ur_mockddi.cpp           |  23 +++-
 source/loader/layers/tracing/ur_trcddi.cpp    |  25 ++--
 source/loader/layers/validation/ur_valddi.cpp |  13 +-
 source/loader/ur_ldrddi.cpp                   |  35 +++++-
 source/loader/ur_libapi.cpp                   |  12 +-
 source/ur_api.cpp                             |   4 +
 .../launch_properties.cpp                     |   4 +-
 19 files changed, 251 insertions(+), 103 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index eb8b07221c..1de876cb7f 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -9560,6 +9560,7 @@ typedef enum ur_exp_launch_property_id_t {
     UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0,            ///< The property has no effect
     UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1,       ///< Whether to launch a cooperative kernel
     UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
+    UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
     /// @cond
     UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -9573,10 +9574,12 @@ typedef enum ur_exp_launch_property_id_t {
 /// _Analogues_
 ///     - **CUlaunchAttributeValue**
 typedef union ur_exp_launch_property_value_t {
-    uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
-                            ///< value must be a divisor of the corresponding global work-size
-                            ///< dimension (in units of work-group).
-    int cooperative;        ///< [in] non-zero value indicates a cooperative kernel
+    uint32_t clusterDim[3];    ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
+                               ///< value must be a divisor of the corresponding global work-size
+                               ///< dimension (in units of work-group).
+ int cooperative; ///< [in] non-zero value indicates a cooperative kernel + size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to + ///< allocate in bytes } ur_exp_launch_property_value_t; @@ -9617,6 +9620,7 @@ typedef struct ur_exp_launch_property_t { /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9645,6 +9649,8 @@ urEnqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel ///< function @@ -11554,6 +11560,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { ur_queue_handle_t *phQueue; ur_kernel_handle_t *phKernel; uint32_t *pworkDim; + const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; uint32_t *pnumPropsInLaunchPropList; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 40a6c5c269..cdf90eda6d 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 8888a74f91..190d3f9cd5 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -10397,6 +10397,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; + break; default: os << "unknown enumerator"; break; @@ -10433,6 +10436,13 @@ inline ur_result_t printUnion( os << (params.cooperative); + break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + + os << ".workgroup_mem_size = "; + + os << (params.workgroup_mem_size); + break; default: os << ""; @@ -15100,6 +15110,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->pworkDim); + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkOffset)); + os << ", "; os << ".pGlobalWorkSize = "; diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index 9e66e9ea06..ca28421815 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -29,6 +29,8 @@ etors: desc: "Whether to launch a cooperative kernel" - name: CLUSTER_DIMENSION desc: "work-group cluster dimensions" + - name: WORK_GROUP_MEMORY + desc: "Implicit work group memory allocation" --- #-------------------------------------------------------------------------- type: union desc: "Specifies a launch property value" @@ -45,6 +47,10 @@ members: name: cooperative desc: "[in] non-zero value indicates a cooperative kernel" tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE + - type: size_t + name: 
workgroup_mem_size + desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" + tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY --- #-------------------------------------------------------------------------- type: struct desc: "Kernel launch property" @@ -82,6 +88,9 @@ params: - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: "const size_t*" + name: pGlobalWorkOffset + desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: const size_t* name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" @@ -97,10 +106,10 @@ params: - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" - - type: const ur_event_handle_t* + - type: const $x_event_handle_t* name: phEventWaitList desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " - - type: ur_event_handle_t* + - type: $x_event_handle_t* name: phEvent desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." returns: diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 54a0f778fb..2a4a2cf54f 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +static ur_result_t +enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, size_t WorkGroupMemory) { // Preconditions UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); @@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set work group memory so we can compute the whole memory requirement + if (WorkGroupMemory) + hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); @@ -503,6 +508,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + 
pGlobalWorkSize, pLocalWorkSize,
+                             numEventsInWaitList, phEventWaitList, phEvent,
+                             /*WorkGroupMemory=*/0);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -513,8 +529,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
     coop_prop.value.cooperative = 1;
     return urEnqueueKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
-        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
+        phEvent);
   }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
@@ -523,16 +540,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(

 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  if (numPropsInLaunchPropList == 0) {
-    urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
-                          pLocalWorkSize, numEventsInWaitList, phEventWaitList,
-                          phEvent);
+  size_t WorkGroupMemory = [&]() -> size_t {
+    const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
+        launchPropList, launchPropList + numPropsInLaunchPropList,
+        [](const ur_exp_launch_property_t &Prop) {
+          return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
+        });
+    if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
+      return WorkGroupMemoryProp->value.workgroup_mem_size;
+    return 0;
+  }();
+
+  if (numPropsInLaunchPropList == 0 ||
+      (WorkGroupMemory && numPropsInLaunchPropList == 1)) {
+    return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent,
+                               WorkGroupMemory);
   }
 #if CUDA_VERSION >= 11080
   // Preconditions
@@ -545,7 +575,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     return UR_RESULT_ERROR_INVALID_NULL_POINTER;
   }

-  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  std::vector<CUlaunchAttribute> launch_attribute;
+  launch_attribute.reserve(numPropsInLaunchPropList);

   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
@@ -558,40 +589,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};

+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();

   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - - launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. if (workDim == 3) { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[2]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0]; } else if (workDim == 2) { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[0]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; } else { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[0]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; } UR_CHECK_ERROR(cuFuncSetAttribute( @@ -600,9 +626,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { - launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; - launch_attribute[i].value.cooperative = - launchPropList[i].value.cooperative; + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; + attr.value.cooperative = launchPropList[i].value.cooperative; + break; + } + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { break; } default: { @@ -615,8 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // using the standard UR_CHECK_ERROR if (ur_result_t Ret = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel, - CuFunc, ThreadsPerBlock, BlocksPerGrid); + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); Ret != UR_RESULT_SUCCESS) return Ret; @@ -664,7 +693,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.sharedMemBytes = LocalSize; launch_config.hStream = CuStream; launch_config.attrs = &launch_attribute[0]; - launch_config.numAttrs = numPropsInLaunchPropList; + launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 2b04dfba43..d1b3b61244 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -76,6 +76,7 @@ struct ur_kernel_handle_t_ { /// padded to appropriate alignment. Zero if the argument at the index /// isn't a local memory argument. 
args_size_t OriginalLocalMemSize; + size_t WorkGroupMemory = 0; // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch @@ -134,9 +135,10 @@ struct ur_kernel_handle_t_ { OriginalLocalMemSize[Index] = Size; // Calculate the current starting offset into local data - const size_t LocalOffset = std::accumulate( - std::begin(AlignedLocalMemSize), - std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); + const size_t LocalOffset = + std::accumulate(std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), + size_t{WorkGroupMemory}); // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; @@ -156,20 +158,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. + // Iterate over all existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -192,6 +185,20 @@ struct ur_kernel_handle_t_ { } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. 
+ updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index @@ -206,6 +213,16 @@ struct ur_kernel_handle_t_ { MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); } + void setWorkGroupMemory(size_t MemSize) { + // If the WorkGroupMemory is the same as MemSize, then all accessors + // offsets accounted for this extra memory + if (WorkGroupMemory == MemSize) + return; + WorkGroupMemory = MemSize; + // Update local accessor offsets + updateLocalArgOffset(/*StartIndex=*/0); + } + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -215,7 +232,8 @@ struct ur_kernel_handle_t_ { uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), - std::end(AlignedLocalMemSize), 0); + std::end(AlignedLocalMemSize), 0) + + WorkGroupMemory; } } Args; @@ -300,6 +318,7 @@ struct ur_kernel_handle_t_ { return Args.getIndices(); } + void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } size_t getRegsPerThread() const noexcept { return RegsPerThread; }; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index c4598f3472..95c8d026a7 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -902,14 +902,15 @@ ur_result_t urQueueFlush( ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hQueue; std::ignore = hKernel; std::ignore = workDim; + std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 1215d6449e..0832303b50 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index b7b45625a2..e4659b5f2c 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp( } ur_result_t 
urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { return hQueue->enqueueKernelLaunchCustomExp( - hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 7cb039ccdd..c59f084fc4 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -144,9 +144,9 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t, - const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 519b0ffc1e..05e48c8740 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -1069,13 +1069,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hKernel; std::ignore = workDim; + std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 33e060ded3..bdd3009d63 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { ur_event_handle_t *phEvent) override; ur_result_t enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList,
       ur_event_handle_t *phEvent) override;
diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp
index 42c342444d..c8ce408756 100644
--- a/source/adapters/mock/ur_mockddi.cpp
+++ b/source/adapters/mock/ur_mockddi.cpp
@@ -10126,6 +10126,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
@@ -10153,11 +10156,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_result_t result = UR_RESULT_SUCCESS;

     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue, &hKernel,
-        &workDim, &pGlobalWorkSize,
-        &pLocalWorkSize, &numPropsInLaunchPropList,
-        &launchPropList, &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};

     auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
         mock::getCallbacks().get_before_callback(
@@ -10176,6 +10185,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         result = replaceCallback(&params);
     } else {

+        // optional output handle
+        if (phEvent) {
+            *phEvent = mock::createDummyHandle<ur_event_handle_t>();
+        }
         result = UR_RESULT_SUCCESS;
     }
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 64489c39ac..afd1411ae8 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -8698,6 +8698,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
@@ -8730,11 +8733,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }

     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue, &hKernel,
-        &workDim, &pGlobalWorkSize,
-        &pLocalWorkSize, &numPropsInLaunchPropList,
-        &launchPropList, &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};
     uint64_t instance =
         getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                                    "urEnqueueKernelLaunchCustomExp", &params);

     auto &logger = getContext()->logger;
     logger.info("   ---> urEnqueueKernelLaunchCustomExp\n");

     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel,
workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);

     getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                              "urEnqueueKernelLaunchCustomExp", &params, &result,
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index b3969de10f..c2dcc7be6f 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -9726,6 +9726,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
@@ -9766,6 +9769,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
     }

+    if (NULL == pGlobalWorkOffset) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
     if (NULL == pGlobalWorkSize) {
         return UR_RESULT_ERROR_INVALID_NULL_POINTER;
     }
@@ -9794,9 +9801,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }

     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);

     return result;
 }
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index 86a6ad95a0..602b8f1a82 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -8866,6 +8866,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
@@ -8908,11 +8911,35 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     // convert loader handle to platform handle
     hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;

+    // convert loader handles to platform handles
+    auto phEventWaitListLocal =
+        std::vector<ur_event_handle_t>(numEventsInWaitList);
+    for (size_t i = 0; i < numEventsInWaitList; ++i) {
+        phEventWaitListLocal[i] =
+            reinterpret_cast<ur_event_object_t *>(phEventWaitList[i])->handle;
+    }
+
     // forward to device-platform
-    result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                      pLocalWorkSize, numPropsInLaunchPropList,
-                                      launchPropList, numEventsInWaitList,
-                                      phEventWaitList, phEvent);
+    result = pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitListLocal.data(), phEvent);
+
+    // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap
any output handles below.
+    if (UR_RESULT_SUCCESS != result &&
+        UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) {
+        return result;
+    }

+    try {
+        // convert platform handle to loader handle
+        if (nullptr != phEvent) {
+            *phEvent = reinterpret_cast<ur_event_handle_t>(
+                context->factories.ur_event_factory.getInstance(*phEvent,
+                                                                dditable));
+        }
+    } catch (std::bad_alloc &) {
+        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }

     return result;
 }
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..b2e26a8b8b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -8992,6 +8992,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9020,6 +9021,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
@@ -9050,10 +9054,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         return UR_RESULT_ERROR_UNINITIALIZED;
     }

-    return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                    pLocalWorkSize, numPropsInLaunchPropList,
-                                    launchPropList, numEventsInWaitList,
-                                    phEventWaitList, phEvent);
+    return pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 } catch (...)
{
    return exceptionToResult(std::current_exception());
}

diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 853d61472e..0b2e6a0f74 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -7625,6 +7625,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -7653,6 +7654,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+                           ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
                          ///< number of global work-items in workDim that will execute the kernel
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index a54a44ecaf..23ba56ff4b 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
     AddPodArg(val);
 
     ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp(
-        queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0,
-        nullptr, nullptr));
+        queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1,
+        &props[0], 0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 }

From f08d42c10ce095a96cccd0fd35f16ac264d566e1 Mon Sep 17 00:00:00 2001
From: "Zhang, Winston" 
Date: Tue, 19 Nov 2024 10:00:44 -0800
Subject: [PATCH 058/148] [L0] Interrupt-based event implementation

To expose this functionality in UR, we want two ways of enabling
low-power events:

1. Queue-wide enabling, so that all events created on the queue are
   low-power events.
2. A property passed to urEnqueueEventsWaitWithBarrier, making the
   resulting event a low-power event. This requires the existing
   interface to be extended with properties, potentially through a new
   experimental function.
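As an illustration (not part of this patch), here is a minimal sketch of
the two opt-in paths, assuming the experimental queue flag and
enqueue-ext property introduced by this series are available; the
Context/Device/Queue/Event handles are placeholders:

    // (1) Queue-wide: every event created on this queue is low-power.
    ur_queue_properties_t QueueProps{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES,
                                     nullptr,
                                     UR_QUEUE_FLAG_LOW_POWER_EVENTS_EXP};
    urQueueCreate(Context, Device, &QueueProps, &Queue);

    // (2) Per-call: only this barrier's resulting event is low-power.
    ur_exp_enqueue_ext_properties_t ExtProps{
        UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES, nullptr,
        UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS};
    urEnqueueEventsWaitWithBarrierExt(Queue, &ExtProps, 0, nullptr, &Event);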
Signed-off-by: Zhang, Winston --- source/adapters/level_zero/command_buffer.cpp | 2 +- source/adapters/level_zero/context.cpp | 11 ++++++- source/adapters/level_zero/context.hpp | 21 ++++++++++++- source/adapters/level_zero/device.cpp | 2 ++ source/adapters/level_zero/event.cpp | 16 ++++++---- source/adapters/level_zero/event.hpp | 5 +++- source/adapters/level_zero/queue.cpp | 30 ++++++++++++------- source/adapters/level_zero/queue.hpp | 7 ++++- 8 files changed, 72 insertions(+), 22 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index eccdc5e4d2..14f2db4d84 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -215,7 +215,7 @@ ur_result_t createSyncPointAndGetZeEvents( UR_CALL(EventCreate(CommandBuffer->Context, nullptr /*Queue*/, false /*IsMultiDevice*/, HostVisible, &LaunchEvent, false /*CounterBasedEventEnabled*/, - !CommandBuffer->IsProfilingEnabled)); + !CommandBuffer->IsProfilingEnabled, false)); LaunchEvent->CommandType = CommandType; ZeLaunchEvent = LaunchEvent->ZeEvent; diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 4fd1db0933..fe0f679a7a 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -478,7 +478,8 @@ static const uint32_t MaxNumEventsPerPool = [] { ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, bool ProfilingEnabled, ur_device_handle_t Device, - bool CounterBasedEventEnabled, bool UsingImmCmdList) { + bool CounterBasedEventEnabled, bool UsingImmCmdList, + bool InterruptBasedEventEnabled) { // Lock while updating event pool machinery. 
std::scoped_lock Lock(ZeEventPoolCacheMutex); @@ -537,6 +538,14 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( counterBasedExt.flags); ZeEventPoolDesc.pNext = &counterBasedExt; } + if (InterruptBasedEventEnabled) { + ze_intel_event_sync_mode_exp_desc_t eventSyncMode = { + ZE_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC, nullptr, 0}; + eventSyncMode.syncModeFlags = + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT | + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT; + ZeEventPoolDesc.pNext = &eventSyncMode; + } std::vector ZeDevices; if (ZeDevice) { diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 470c4c4f35..05e3369274 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -33,6 +33,24 @@ struct l0_command_list_cache_info { bool IsImmediate = false; }; +typedef uint32_t ze_intel_event_sync_mode_exp_flags_t; +typedef enum _ze_intel_event_sync_mode_exp_flag_t { + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT = ZE_BIT(0), + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT = ZE_BIT(1), + ZE_INTEL_EVENT_SYNC_MODE_EXP_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} ze_intel_event_sync_mode_exp_flag_t; + +#define ZE_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC \ + (ze_structure_type_t)0x00030016 + +typedef struct _ze_intel_event_sync_mode_exp_desc_t { + ze_structure_type_t stype; + const void *pNext; + + ze_intel_event_sync_mode_exp_flags_t syncModeFlags; +} ze_intel_event_sync_mode_exp_desc_t; + struct ur_context_handle_t_ : _ur_object { ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices, const ur_device_handle_t *Devs, bool OwnZeContext) @@ -199,7 +217,8 @@ struct ur_context_handle_t_ : _ur_object { bool ProfilingEnabled, ur_device_handle_t Device, bool CounterBasedEventEnabled, - bool UsingImmCmdList); + bool UsingImmCmdList, + bool InterruptBasedEventEnabled); // Get ur_event_handle_t from cache. 
ur_event_handle_t getEventFromContextCache(bool HostVisible, diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 99bb20d31a..dd0dafc8aa 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -485,6 +485,8 @@ ur_result_t urDeviceGetInfo( case UR_DEVICE_INFO_BUILT_IN_KERNELS: // TODO: To find out correct value return ReturnValue(""); + case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: + return ReturnValue(UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP); case UR_DEVICE_INFO_QUEUE_PROPERTIES: return ReturnValue( ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 96da4be0fd..028a791bb7 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -422,7 +422,8 @@ ur_result_t urEnqueueEventsWaitWithBarrier( ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t Queue, ///< [in] handle of the queue object const ur_exp_enqueue_ext_properties_t - *, ///< [in][optional] pointer to the extended enqueue properties + *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue + ///< properties uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] @@ -913,7 +914,7 @@ ur_result_t urExtEventCreate( UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, true /*HostVisible*/, Event, false /*CounterBasedEventEnabled*/, - false /*ForceDisableProfiling*/)); + false /*ForceDisableProfiling*/, false)); (*Event)->RefCountExternal++; if (!(*Event)->CounterBasedEventsEnabled) @@ -935,7 +936,7 @@ ur_result_t urEventCreateWithNativeHandle( UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, true /*HostVisible*/, Event, false /*CounterBasedEventEnabled*/, - false /*ForceDisableProfiling*/)); + false /*ForceDisableProfiling*/, false)); (*Event)->RefCountExternal++; if (!(*Event)->CounterBasedEventsEnabled) @@ -1293,7 +1294,8 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, ur_event_handle_t *RetEvent, bool CounterBasedEventEnabled, - bool ForceDisableProfiling) { + bool ForceDisableProfiling, + bool InterruptBasedEventEnabled) { bool ProfilingEnabled = ForceDisableProfiling ? 
false : (!Queue || Queue->isProfilingEnabled()); bool UsingImmediateCommandlists = !Queue || Queue->UsingImmCmdLists; @@ -1317,14 +1319,15 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, if (auto Res = Context->getFreeSlotInExistingOrNewPool( ZeEventPool, Index, HostVisible, ProfilingEnabled, Device, - CounterBasedEventEnabled, UsingImmediateCommandlists)) + CounterBasedEventEnabled, UsingImmediateCommandlists, + Queue->interruptBasedEventsEnabled())) return Res; ZeStruct ZeEventDesc; ZeEventDesc.index = Index; ZeEventDesc.wait = 0; - if (HostVisible || CounterBasedEventEnabled) { + if (HostVisible || CounterBasedEventEnabled || InterruptBasedEventEnabled) { ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; } else { // @@ -1350,6 +1353,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, return UR_RESULT_ERROR_UNKNOWN; } (*RetEvent)->CounterBasedEventsEnabled = CounterBasedEventEnabled; + (*RetEvent)->InterruptBasedEventsEnabled = InterruptBasedEventEnabled; if (HostVisible) (*RetEvent)->HostVisibleEvent = reinterpret_cast(*RetEvent); diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 2c9e698e3c..de018e7060 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -33,7 +33,8 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, ur_event_handle_t *RetEvent, bool CounterBasedEventEnabled, - bool ForceDisableProfiling); + bool ForceDisableProfiling, + bool InterruptBasedEventEnabled); } // extern "C" // This is an experimental option that allows to disable caching of events in @@ -251,6 +252,8 @@ struct ur_event_handle_t_ : _ur_object { std::optional completionBatch; // Keeps track of whether we are using Counter-based Events. bool CounterBasedEventsEnabled = false; + // Keeps track of whether we are using Interrupt-based Events. + bool InterruptBasedEventsEnabled = false; }; // Helper function to implement zeHostSynchronize. 
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index c4598f3472..e493dcc60a 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -104,10 +104,10 @@ ur_result_t ur_completion_batch::seal(ur_queue_handle_t queue, assert(st == ACCUMULATING); if (!barrierEvent) { - UR_CALL(EventCreate(queue->Context, queue, false /*IsMultiDevice*/, - true /*HostVisible*/, &barrierEvent, - false /*CounterBasedEventEnabled*/, - false /*ForceDisableProfiling*/)); + UR_CALL(EventCreate( + queue->Context, queue, false /*IsMultiDevice*/, true /*HostVisible*/, + &barrierEvent, false /*CounterBasedEventEnabled*/, + false /*ForceDisableProfiling*/, false /*InterruptBasedEventEnabled*/)); } // Instead of collecting all the batched events, we simply issue a global @@ -1494,6 +1494,11 @@ bool ur_queue_handle_t_::doReuseDiscardedEvents() { return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); } +bool ur_queue_handle_t_::interruptBasedEventsEnabled() { + return isInOrderQueue() && Device->useDriverInOrderLists() && + isLowPowerEvents(); +} + ur_result_t ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { @@ -1654,6 +1659,10 @@ bool ur_queue_handle_t_::isInOrderQueue() const { 0); } +bool ur_queue_handle_t_::isLowPowerEvents() const { + return ((this->Properties & UR_QUEUE_FLAG_LOW_POWER_EVENTS_EXP) != 0); +} + // Helper function to perform the necessary cleanup of the events from reset cmd // list. ur_result_t CleanupEventListFromResetCmdList( @@ -1868,12 +1877,10 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, // visible pool. // \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. -ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, - ur_event_handle_t *Event, - ur_command_t CommandType, - ur_command_list_ptr_t CommandList, - bool IsInternal, bool IsMultiDevice, - std::optional HostVisible) { +ur_result_t createEventAndAssociateQueue( + ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, + std::optional HostVisible, std::optional InterruptBasedEvents) { if (!HostVisible.has_value()) { // Internal/discarded events do not need host-scope visibility. @@ -1888,7 +1895,8 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, if (*Event == nullptr) UR_CALL(EventCreate( Queue->Context, Queue, IsMultiDevice, HostVisible.value(), Event, - Queue->CounterBasedEventsEnabled, false /*ForceDisableProfiling*/)); + Queue->CounterBasedEventsEnabled, false /*ForceDisableProfiling*/, + HostVisible.has_value() ? true : Queue->interruptBasedEventsEnabled())); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 1108e4c268..786f1bdd51 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -533,6 +533,8 @@ struct ur_queue_handle_t_ : _ur_object { // queue. bool doReuseDiscardedEvents(); + bool interruptBasedEventsEnabled(); + // Append command to provided command list to wait and reset the last event if // it is discarded and create new ur_event_handle_t wrapper using the same // native event and put it to the cache. 
We call this method after each @@ -557,6 +559,9 @@ struct ur_queue_handle_t_ : _ur_object { // Returns true if the queue has discard events property. bool isDiscardEvents() const; + // Returns true if the queue has low power events property. + bool isLowPowerEvents() const; + // Returns true if the queue has explicit priority set by user. bool isPriorityLow() const; bool isPriorityHigh() const; @@ -708,7 +713,7 @@ struct ur_queue_handle_t_ : _ur_object { ur_result_t createEventAndAssociateQueue( ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, - std::optional HostVisible = std::nullopt); + std::optional HostVisible = std::nullopt, std::optional InterruptBasedEvents = std::nullopt); // This helper function checks to see if an event for a command can be included // at the end of a command list batch. This will only be true if the event does From d0f66948f6c9bdb44b914efd08b67f105cdbf7c6 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Wed, 20 Nov 2024 17:44:04 -0800 Subject: [PATCH 059/148] [L0] Fix urEnqueueEventsWaitWithBarrier option1 Signed-off-by: Zhang, Winston --- source/adapters/level_zero/event.cpp | 295 ++++++++++++++++++++++++++- source/adapters/level_zero/queue.cpp | 4 +- 2 files changed, 296 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 028a791bb7..4d7991b1a6 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -435,9 +435,300 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - return ur::level_zero::urEnqueueEventsWaitWithBarrier( - Queue, NumEventsInWaitList, EventWaitList, OutEvent); + bool InterruptBased = + EnqueueExtProp && + (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); + if (!InterruptBased) { + return ur::level_zero::urEnqueueEventsWaitWithBarrier( + Queue, NumEventsInWaitList, EventWaitList, OutEvent); + } + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + // Helper function for appending a barrier to a command list. + auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList, + _ur_ze_event_list_t &EventWaitList, + ur_event_handle_t &Event, + bool IsInternal) { + UR_CALL(createEventAndAssociateQueue( + Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, IsInternal, + false, std::nullopt, true)); + Event->WaitList = EventWaitList; + + // For in-order queue we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. + // + // Only consideration here is that when profiling is used, signalEvent + // cannot be used if EventWaitList.Length == 0. In those cases, we need + // to fallback directly to barrier to have correct timestamps. See here: + // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t + // + // TODO: this and other special handling of in-order queues to be + // updated when/if Level Zero adds native support for in-order queues. 
+ // + if (Queue->isInOrderQueue() && InOrderBarrierBySignal && + !Queue->isProfilingEnabled()) { + if (EventWaitList.Length) { + if (CmdList->second.IsInOrderList) { + for (unsigned i = EventWaitList.Length; i-- > 0;) { + // If the event is a multidevice event, then given driver in order + // lists, we cannot include this into the wait event list due to + // driver limitations. + if (EventWaitList.UrEventList[i]->IsMultiDevice) { + EventWaitList.Length--; + if (EventWaitList.Length != i) { + std::swap(EventWaitList.UrEventList[i], + EventWaitList.UrEventList[EventWaitList.Length]); + std::swap(EventWaitList.ZeEventList[i], + EventWaitList.ZeEventList[EventWaitList.Length]); + } + } + } + } + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CmdList->first, Event->ZeEvent)); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + } + + return UR_RESULT_SUCCESS; + }; + + // If the queue is in-order then each command in it effectively acts as a + // barrier, so we don't need to do anything except if we were requested + // a "barrier" event to be created. Or if we need to wait for events in + // potentially different queues. + // + if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { + return UR_RESULT_SUCCESS; + } + + ur_event_handle_t ResultEvent = nullptr; + bool IsInternal = OutEvent == nullptr; + // For in-order queue and wait-list which is empty or has events from + // the same queue just use the last command event as the barrier event. + // This optimization is disabled when profiling is enabled to ensure + // accurate profiling values & the overhead that profiling incurs. + if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() && + WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, + EventWaitList) && + Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { + UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent)); + ResultEvent = Queue->LastCommandEvent; + if (OutEvent) { + *OutEvent = ResultEvent; + } + return UR_RESULT_SUCCESS; + } + + // Indicator for whether batching is allowed. This may be changed later in + // this function, but allow it by default. + bool OkToBatch = true; + + // If we have a list of events to make the barrier from, then we can create a + // barrier on these and use the resulting event as our future barrier. + // We use the same approach if + // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a + // positive value. + // We use the same approach if we have in-order queue because every command + // depends on previous one, so we don't need to insert barrier to multiple + // command lists. + if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || + Queue->isInOrderQueue()) { + // Retain the events as they will be owned by the result event. + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); + + // Get an arbitrary command-list in the queue. + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/)); + + // Insert the barrier into the command-list and execute. 
+ UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent, + IsInternal)); + + UR_CALL( + Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch)); + + // Because of the dependency between commands in the in-order queue we don't + // need to keep track of any active barriers if we have in-order queue. + if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { + auto UREvent = reinterpret_cast(ResultEvent); + Queue->ActiveBarriers.add(UREvent); + } + + if (OutEvent) { + *OutEvent = ResultEvent; + } + return UR_RESULT_SUCCESS; + } + + // Since there are no events to explicitly create a barrier for, we are + // inserting a queue-wide barrier. + + // Command list(s) for putting barriers. + std::vector CmdLists; + + // There must be at least one L0 queue. + auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); + auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); + UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), + UR_RESULT_ERROR_INVALID_QUEUE); + + size_t NumQueues = 0; + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + NumQueues += QueueGroup.second.ZeQueues.size(); + + OkToBatch = true; + // Get an available command list tied to each command queue. We need + // these so a queue-wide barrier can be inserted into each command + // queue. + CmdLists.reserve(NumQueues); + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) { + bool UseCopyEngine = + QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; + if (Queue->UsingImmCmdLists) { + // If immediate command lists are being used, each will act as their own + // queue, so we must insert a barrier into each. + for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) + if (ImmCmdList != Queue->CommandListMap.end()) + CmdLists.push_back(ImmCmdList); + } else { + for (auto ZeQueue : QueueGroup.second.ZeQueues) { + if (ZeQueue) { + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, UseCopyEngine, NumEventsInWaitList, + EventWaitList, OkToBatch, &ZeQueue)); + CmdLists.push_back(CmdList); + } + } + } + } + + // If no activity has occurred on the queue then there will be no cmdlists. + // We need one for generating an Event, so create one. + if (CmdLists.size() == 0) { + // Get any available command list. + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/)); + CmdLists.push_back(CmdList); + } + + if (CmdLists.size() > 1) { + // Insert a barrier into each unique command queue using the available + // command-lists. + std::vector EventWaitVector(CmdLists.size()); + for (size_t I = 0; I < CmdLists.size(); ++I) { + _ur_ze_event_list_t waitlist; + UR_CALL(insertBarrierIntoCmdList( + CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/)); + } + // If there were multiple queues we need to create a "convergence" event to + // be our active barrier. This convergence event is signalled by a barrier + // on all the events from the barriers we have inserted into each queue. + // Use the first command list as our convergence command list. + ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; + + // Create an event list. It will take ownership over all relevant events so + // we relinquish ownership and let it keep all events it needs. 
+ _ur_ze_event_list_t BaseWaitList; + UR_CALL(BaseWaitList.createAndRetainUrZeEventList( + EventWaitVector.size(), + reinterpret_cast(EventWaitVector.data()), + Queue, ConvergenceCmdList->second.isCopy(Queue))); + + // Insert a barrier with the events from each command-queue into the + // convergence command list. The resulting event signals the convergence of + // all barriers. + UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, + ResultEvent, IsInternal)); + } else { + // If there is only a single queue then insert a barrier and the single + // result event can be used as our active barrier and used as the return + // event. Take into account whether output event is discarded or not. + _ur_ze_event_list_t waitlist; + UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent, + IsInternal)); + } + + // Execute each command list so the barriers can be encountered. + for (ur_command_list_ptr_t &CmdList : CmdLists) { + bool IsCopy = + CmdList->second.isCopy(reinterpret_cast(Queue)); + const auto &CommandBatch = + (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Only batch if the matching CmdList is already open. + OkToBatch = CommandBatch.OpenCommandList == CmdList; + + UR_CALL( + Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch)); + } + + UR_CALL(Queue->ActiveBarriers.clear()); + Queue->ActiveBarriers.add(ResultEvent); + if (OutEvent) { + *OutEvent = ResultEvent; + } + return UR_RESULT_SUCCESS; } +/* +ur_result_t urEnqueueEventsWaitWithBarrierExt( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const ur_exp_enqueue_ext_properties_t + *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue +properties uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + bool InterruptBased = EnqueueExtProp && (EnqueueExtProp->flags & +UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); ur_event_handle_t ResultEvent = +nullptr; + + if (InterruptBased) { + // Create the event with interrupt-based properties + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CmdList, false, +NumEventsInWaitList, EventWaitList, true, nullptr)); + UR_CALL(createEventAndAssociateQueue(Queue, &ResultEvent, +UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, true, false, std::nullopt, +InterruptBased)); + } + + ur_result_t result = ur::level_zero::urEnqueueEventsWaitWithBarrier( + Queue, NumEventsInWaitList, EventWaitList, OutEvent); + + if (InterruptBased && OutEvent) { + *OutEvent = ResultEvent; + } + return result; +} + +*/ ur_result_t urEventGetInfo( ur_event_handle_t Event, ///< [in] handle of the event object diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index e493dcc60a..cae3d3d989 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1896,7 +1896,9 @@ ur_result_t createEventAndAssociateQueue( UR_CALL(EventCreate( Queue->Context, Queue, IsMultiDevice, HostVisible.value(), Event, Queue->CounterBasedEventsEnabled, false /*ForceDisableProfiling*/, - HostVisible.has_value() ? true : Queue->interruptBasedEventsEnabled())); + InterruptBasedEvents.has_value() + ? InterruptBasedEvents.value() + : Queue->interruptBasedEventsEnabled())); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; From f8a842c9350ed5b7528f1579534c7eac730350c9 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Thu, 21 Nov 2024 18:05:44 -0800 Subject: [PATCH 060/148] [L0] Cleaned up urEnqueueEventsWaitWithBarrier(Ext) with helper option Signed-off-by: Zhang, Winston --- source/adapters/level_zero/event.cpp | 430 ++++++--------------------- source/adapters/level_zero/event.hpp | 6 + 2 files changed, 93 insertions(+), 343 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 4d7991b1a6..b9eb6bacd6 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -156,7 +156,7 @@ static const bool InOrderBarrierBySignal = [] { return (UrRet ? std::atoi(UrRet) : true); }(); -ur_result_t urEnqueueEventsWaitWithBarrier( +ur_result_t EnqueueEventsWaitWithBarrier( ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t @@ -166,69 +166,69 @@ ur_result_t urEnqueueEventsWaitWithBarrier( ///< the numEventsInWaitList must be 0, indicating that ///< all previously enqueued commands must be complete. ur_event_handle_t - *OutEvent ///< [in,out][optional] return an event object that identifies - ///< this particular command instance. -) { + *OutEvent, ///< [in,out][optional] return an event object that + ///< identifies this particular command instance. + bool InterruptBasedEventsEnabled) { // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); // Helper function for appending a barrier to a command list. 
-  auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
-                                           _ur_ze_event_list_t &EventWaitList,
-                                           ur_event_handle_t &Event,
-                                           bool IsInternal) {
-    UR_CALL(createEventAndAssociateQueue(Queue, &Event,
-                                         UR_COMMAND_EVENTS_WAIT_WITH_BARRIER,
-                                         CmdList, IsInternal, false));
-
-    Event->WaitList = EventWaitList;
-
-    // For in-order queue we don't need a real barrier, just wait for
-    // requested events in potentially different queues and add a "barrier"
-    // event signal because it is already guaranteed that previous commands
-    // in this queue are completed when the signal is started.
-    //
-    // Only consideration here is that when profiling is used, signalEvent
-    // cannot be used if EventWaitList.Lenght == 0. In those cases, we need
-    // to fallback directly to barrier to have correct timestamps. See here:
-    // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
-    //
-    // TODO: this and other special handling of in-order queues to be
-    // updated when/if Level Zero adds native support for in-order queues.
-    //
-    if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
-        !Queue->isProfilingEnabled()) {
-      if (EventWaitList.Length) {
-        if (CmdList->second.IsInOrderList) {
-          for (unsigned i = EventWaitList.Length; i-- > 0;) {
-            // If the event is a multidevice event, then given driver in order
-            // lists, we cannot include this into the wait event list due to
-            // driver limitations.
-            if (EventWaitList.UrEventList[i]->IsMultiDevice) {
-              EventWaitList.Length--;
-              if (EventWaitList.Length != i) {
-                std::swap(EventWaitList.UrEventList[i],
-                          EventWaitList.UrEventList[EventWaitList.Length]);
-                std::swap(EventWaitList.ZeEventList[i],
-                          EventWaitList.ZeEventList[EventWaitList.Length]);
+  auto insertBarrierIntoCmdList =
+      [&Queue](ur_command_list_ptr_t CmdList,
+               _ur_ze_event_list_t &EventWaitList, ur_event_handle_t &Event,
+               bool IsInternal, bool InterruptBasedEventsEnabled) {
+        UR_CALL(createEventAndAssociateQueue(
+            Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList,
+            IsInternal, InterruptBasedEventsEnabled));
+
+        Event->WaitList = EventWaitList;
+
+        // For in-order queue we don't need a real barrier, just wait for
+        // requested events in potentially different queues and add a "barrier"
+        // event signal because it is already guaranteed that previous commands
+        // in this queue are completed when the signal is started.
+        //
+        // Only consideration here is that when profiling is used, signalEvent
+        // cannot be used if EventWaitList.Length == 0. In those cases, we need
+        // to fallback directly to barrier to have correct timestamps. See here:
+        // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
+        //
+        // TODO: this and other special handling of in-order queues to be
+        // updated when/if Level Zero adds native support for in-order queues.
+        //
+        if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
+            !Queue->isProfilingEnabled()) {
+          if (EventWaitList.Length) {
+            if (CmdList->second.IsInOrderList) {
+              for (unsigned i = EventWaitList.Length; i-- > 0;) {
+                // If the event is a multidevice event, then given driver in
+                // order lists, we cannot include this into the wait event list
+                // due to driver limitations.
+ if (EventWaitList.UrEventList[i]->IsMultiDevice) { + EventWaitList.Length--; + if (EventWaitList.Length != i) { + std::swap(EventWaitList.UrEventList[i], + EventWaitList.UrEventList[EventWaitList.Length]); + std::swap(EventWaitList.ZeEventList[i], + EventWaitList.ZeEventList[EventWaitList.Length]); + } + } } } + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CmdList->first, EventWaitList.Length, + EventWaitList.ZeEventList)); } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CmdList->first, Event->ZeEvent)); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); } - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); - } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CmdList->first, Event->ZeEvent)); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); - } - return UR_RESULT_SUCCESS; - }; + return UR_RESULT_SUCCESS; + }; // If the queue is in-order then each command in it effectively acts as a // barrier, so we don't need to do anything except if we were requested @@ -285,7 +285,7 @@ ur_result_t urEnqueueEventsWaitWithBarrier( // Insert the barrier into the command-list and execute. UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent, - IsInternal)); + IsInternal, InterruptBasedEventsEnabled)); UR_CALL( Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch)); @@ -367,8 +367,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier( std::vector EventWaitVector(CmdLists.size()); for (size_t I = 0; I < CmdLists.size(); ++I) { _ur_ze_event_list_t waitlist; - UR_CALL(insertBarrierIntoCmdList( - CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/)); + UR_CALL(insertBarrierIntoCmdList(CmdLists[I], waitlist, + EventWaitVector[I], true /*IsInternal*/, + InterruptBasedEventsEnabled)); } // If there were multiple queues we need to create a "convergence" event to // be our active barrier. This convergence event is signalled by a barrier @@ -388,14 +389,15 @@ ur_result_t urEnqueueEventsWaitWithBarrier( // convergence command list. The resulting event signals the convergence of // all barriers. UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, - ResultEvent, IsInternal)); + ResultEvent, IsInternal, + InterruptBasedEventsEnabled)); } else { // If there is only a single queue then insert a barrier and the single // result event can be used as our active barrier and used as the return // event. Take into account whether output event is discarded or not. _ur_ze_event_list_t waitlist; UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent, - IsInternal)); + IsInternal, InterruptBasedEventsEnabled)); } // Execute each command list so the barriers can be encountered. 
@@ -419,11 +421,8 @@ ur_result_t urEnqueueEventsWaitWithBarrier( return UR_RESULT_SUCCESS; } -ur_result_t urEnqueueEventsWaitWithBarrierExt( - ur_queue_handle_t Queue, ///< [in] handle of the queue object - const ur_exp_enqueue_ext_properties_t - *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue - ///< properties +ur_result_t urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] @@ -435,266 +434,18 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - bool InterruptBased = - EnqueueExtProp && - (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); - if (!InterruptBased) { - return ur::level_zero::urEnqueueEventsWaitWithBarrier( - Queue, NumEventsInWaitList, EventWaitList, OutEvent); - } - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - // Helper function for appending a barrier to a command list. - auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList, - _ur_ze_event_list_t &EventWaitList, - ur_event_handle_t &Event, - bool IsInternal) { - UR_CALL(createEventAndAssociateQueue( - Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, IsInternal, - false, std::nullopt, true)); - Event->WaitList = EventWaitList; - - // For in-order queue we don't need a real barrier, just wait for - // requested events in potentially different queues and add a "barrier" - // event signal because it is already guaranteed that previous commands - // in this queue are completed when the signal is started. - // - // Only consideration here is that when profiling is used, signalEvent - // cannot be used if EventWaitList.Length == 0. In those cases, we need - // to fallback directly to barrier to have correct timestamps. See here: - // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t - // - // TODO: this and other special handling of in-order queues to be - // updated when/if Level Zero adds native support for in-order queues. - // - if (Queue->isInOrderQueue() && InOrderBarrierBySignal && - !Queue->isProfilingEnabled()) { - if (EventWaitList.Length) { - if (CmdList->second.IsInOrderList) { - for (unsigned i = EventWaitList.Length; i-- > 0;) { - // If the event is a multidevice event, then given driver in order - // lists, we cannot include this into the wait event list due to - // driver limitations. 
- if (EventWaitList.UrEventList[i]->IsMultiDevice) { - EventWaitList.Length--; - if (EventWaitList.Length != i) { - std::swap(EventWaitList.UrEventList[i], - EventWaitList.UrEventList[EventWaitList.Length]); - std::swap(EventWaitList.ZeEventList[i], - EventWaitList.ZeEventList[EventWaitList.Length]); - } - } - } - } - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); - } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CmdList->first, Event->ZeEvent)); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); - } - - return UR_RESULT_SUCCESS; - }; - - // If the queue is in-order then each command in it effectively acts as a - // barrier, so we don't need to do anything except if we were requested - // a "barrier" event to be created. Or if we need to wait for events in - // potentially different queues. - // - if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { - return UR_RESULT_SUCCESS; - } - - ur_event_handle_t ResultEvent = nullptr; - bool IsInternal = OutEvent == nullptr; - // For in-order queue and wait-list which is empty or has events from - // the same queue just use the last command event as the barrier event. - // This optimization is disabled when profiling is enabled to ensure - // accurate profiling values & the overhead that profiling incurs. - if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() && - WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, - EventWaitList) && - Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { - UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent)); - ResultEvent = Queue->LastCommandEvent; - if (OutEvent) { - *OutEvent = ResultEvent; - } - return UR_RESULT_SUCCESS; - } - - // Indicator for whether batching is allowed. This may be changed later in - // this function, but allow it by default. - bool OkToBatch = true; - - // If we have a list of events to make the barrier from, then we can create a - // barrier on these and use the resulting event as our future barrier. - // We use the same approach if - // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a - // positive value. - // We use the same approach if we have in-order queue because every command - // depends on previous one, so we don't need to insert barrier to multiple - // command lists. - if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || - Queue->isInOrderQueue()) { - // Retain the events as they will be owned by the result event. - _ur_ze_event_list_t TmpWaitList; - UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); - - // Get an arbitrary command-list in the queue. - ur_command_list_ptr_t CmdList; - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, - EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/)); - - // Insert the barrier into the command-list and execute. - UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent, - IsInternal)); - - UR_CALL( - Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch)); - - // Because of the dependency between commands in the in-order queue we don't - // need to keep track of any active barriers if we have in-order queue. 
- if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { - auto UREvent = reinterpret_cast(ResultEvent); - Queue->ActiveBarriers.add(UREvent); - } - - if (OutEvent) { - *OutEvent = ResultEvent; - } - return UR_RESULT_SUCCESS; - } - - // Since there are no events to explicitly create a barrier for, we are - // inserting a queue-wide barrier. - - // Command list(s) for putting barriers. - std::vector CmdLists; - - // There must be at least one L0 queue. - auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); - auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); - UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), - UR_RESULT_ERROR_INVALID_QUEUE); - - size_t NumQueues = 0; - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - NumQueues += QueueGroup.second.ZeQueues.size(); - - OkToBatch = true; - // Get an available command list tied to each command queue. We need - // these so a queue-wide barrier can be inserted into each command - // queue. - CmdLists.reserve(NumQueues); - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) { - bool UseCopyEngine = - QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; - if (Queue->UsingImmCmdLists) { - // If immediate command lists are being used, each will act as their own - // queue, so we must insert a barrier into each. - for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) - if (ImmCmdList != Queue->CommandListMap.end()) - CmdLists.push_back(ImmCmdList); - } else { - for (auto ZeQueue : QueueGroup.second.ZeQueues) { - if (ZeQueue) { - ur_command_list_ptr_t CmdList; - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, UseCopyEngine, NumEventsInWaitList, - EventWaitList, OkToBatch, &ZeQueue)); - CmdLists.push_back(CmdList); - } - } - } - } - - // If no activity has occurred on the queue then there will be no cmdlists. - // We need one for generating an Event, so create one. - if (CmdLists.size() == 0) { - // Get any available command list. - ur_command_list_ptr_t CmdList; - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, - EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/)); - CmdLists.push_back(CmdList); - } - - if (CmdLists.size() > 1) { - // Insert a barrier into each unique command queue using the available - // command-lists. - std::vector EventWaitVector(CmdLists.size()); - for (size_t I = 0; I < CmdLists.size(); ++I) { - _ur_ze_event_list_t waitlist; - UR_CALL(insertBarrierIntoCmdList( - CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/)); - } - // If there were multiple queues we need to create a "convergence" event to - // be our active barrier. This convergence event is signalled by a barrier - // on all the events from the barriers we have inserted into each queue. - // Use the first command list as our convergence command list. - ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; - - // Create an event list. It will take ownership over all relevant events so - // we relinquish ownership and let it keep all events it needs. - _ur_ze_event_list_t BaseWaitList; - UR_CALL(BaseWaitList.createAndRetainUrZeEventList( - EventWaitVector.size(), - reinterpret_cast(EventWaitVector.data()), - Queue, ConvergenceCmdList->second.isCopy(Queue))); - - // Insert a barrier with the events from each command-queue into the - // convergence command list. 
The resulting event signals the convergence of - // all barriers. - UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, - ResultEvent, IsInternal)); - } else { - // If there is only a single queue then insert a barrier and the single - // result event can be used as our active barrier and used as the return - // event. Take into account whether output event is discarded or not. - _ur_ze_event_list_t waitlist; - UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent, - IsInternal)); - } - - // Execute each command list so the barriers can be encountered. - for (ur_command_list_ptr_t &CmdList : CmdLists) { - bool IsCopy = - CmdList->second.isCopy(reinterpret_cast(Queue)); - const auto &CommandBatch = - (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; - // Only batch if the matching CmdList is already open. - OkToBatch = CommandBatch.OpenCommandList == CmdList; - - UR_CALL( - Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch)); - } - - UR_CALL(Queue->ActiveBarriers.clear()); - Queue->ActiveBarriers.add(ResultEvent); - if (OutEvent) { - *OutEvent = ResultEvent; - } - return UR_RESULT_SUCCESS; + return static_cast(EnqueueEventsWaitWithBarrier)( + Queue, NumEventsInWaitList, EventWaitList, OutEvent, + Queue->interruptBasedEventsEnabled()); } -/* + ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t Queue, ///< [in] handle of the queue object const ur_exp_enqueue_ext_properties_t *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue -properties uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] ///< pointer to a list of events that must be complete @@ -705,31 +456,24 @@ properties uint32_t NumEventsInWaitList, ///< [in] size of the event wait list *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - bool InterruptBased = EnqueueExtProp && (EnqueueExtProp->flags & -UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); ur_event_handle_t ResultEvent = -nullptr; - - if (InterruptBased) { - // Create the event with interrupt-based properties - ur_command_list_ptr_t CmdList; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CmdList, false, -NumEventsInWaitList, EventWaitList, true, nullptr)); - UR_CALL(createEventAndAssociateQueue(Queue, &ResultEvent, -UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, true, false, std::nullopt, -InterruptBased)); - } - - ur_result_t result = ur::level_zero::urEnqueueEventsWaitWithBarrier( - Queue, NumEventsInWaitList, EventWaitList, OutEvent); - - if (InterruptBased && OutEvent) { - *OutEvent = ResultEvent; - } - return result; + bool InterruptBased = + EnqueueExtProp && + (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); + if (InterruptBased) { + // Create the event with interrupt-based properties + return static_cast(EnqueueEventsWaitWithBarrier)( + Queue, NumEventsInWaitList, EventWaitList, OutEvent, true); + } else { + return static_cast(EnqueueEventsWaitWithBarrier)( + Queue, NumEventsInWaitList, EventWaitList, OutEvent, + Queue->interruptBasedEventsEnabled()); + } } -*/ - ur_result_t urEventGetInfo( ur_event_handle_t Event, ///< [in] handle of the event object ur_event_info_t PropName, ///< [in] the name of the event property to query diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index de018e7060..d894b2ef4e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -279,6 +279,12 @@ template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle); ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, bool SetEventCompleted); +ur_result_t EnqueueEventsWaitWithBarrier(ur_queue_handle_t Queue, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventList, + ur_event_handle_t *OutEvent, + bool InterruptBasedEventsEnabled); + // Get value of device scope events env var setting or default setting static const EventsScope DeviceEventsSetting = [] { char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); From 2690fb4d6a9d9f2c9a1d441f9e0379e561cdad91 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Thu, 21 Nov 2024 18:37:58 -0800 Subject: [PATCH 061/148] [L0] Rebased against top of main Signed-off-by: Zhang, Winston --- source/adapters/level_zero/command_buffer.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 14f2db4d84..de2f20ca38 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -215,7 +215,8 @@ ur_result_t createSyncPointAndGetZeEvents( UR_CALL(EventCreate(CommandBuffer->Context, nullptr /*Queue*/, false /*IsMultiDevice*/, HostVisible, &LaunchEvent, false /*CounterBasedEventEnabled*/, - !CommandBuffer->IsProfilingEnabled, false)); + !CommandBuffer->IsProfilingEnabled, + false /*InterruptBasedEventEnabled*/)); LaunchEvent->CommandType = CommandType; ZeLaunchEvent = LaunchEvent->ZeEvent; @@ -680,13 +681,15 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, if (Device->hasMainCopyEngine()) { UR_CALL(EventCreate(Context, nullptr /*Queue*/, false, false, &CopyFinishedEvent, UseCounterBasedEvents, - !EnableProfiling)); + !EnableProfiling, + false /*InterruptBasedEventEnabled*/)); } if 
(EnableProfiling) { UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, false /*HostVisible*/, &ComputeFinishedEvent, - UseCounterBasedEvents, !EnableProfiling)); + UseCounterBasedEvents, !EnableProfiling, + false /*InterruptBasedEventEnabled*/)); } } @@ -695,7 +698,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, if (WaitEventPath) { UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, false /*HostVisible*/, &WaitEvent, - false /*CounterBasedEventEnabled*/, !EnableProfiling)); + false /*CounterBasedEventEnabled*/, !EnableProfiling, + false /*InterruptBasedEventEnabled*/)); } // Create ZeCommandListResetEvents only if counter-based events are not being @@ -707,7 +711,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, if (!UseCounterBasedEvents) { UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, false /*HostVisible*/, &AllResetEvent, - false /*CounterBasedEventEnabled*/, !EnableProfiling)); + false /*CounterBasedEventEnabled*/, !EnableProfiling, + false /*InterruptBasedEventEnabled*/)); UR_CALL(createMainCommandList(Context, Device, false, false, false, ZeCommandListResetEvents)); @@ -715,7 +720,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // The ExecutionFinishedEvent is only waited on by ZeCommandListResetEvents. UR_CALL(EventCreate(Context, nullptr /*Queue*/, false /*IsMultiDevice*/, false /*HostVisible*/, &ExecutionFinishedEvent, - false /*CounterBasedEventEnabled*/, !EnableProfiling)); + false /*CounterBasedEventEnabled*/, !EnableProfiling, + false /*InterruptBased*/)); } try { From 4c4509e6b9a4ff140d1308b1fa5584e94143d24c Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Mon, 25 Nov 2024 16:40:59 -0800 Subject: [PATCH 062/148] [L0] Interrupt-based event implementation Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.cpp | 17 ++++-- source/adapters/level_zero/context.hpp | 85 ++++++++++++++++++++------ source/adapters/level_zero/device.cpp | 2 +- source/adapters/level_zero/event.cpp | 9 +-- source/adapters/level_zero/queue.cpp | 29 +++++---- source/adapters/level_zero/queue.hpp | 6 +- 6 files changed, 104 insertions(+), 44 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index fe0f679a7a..a86e8d5308 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -488,9 +488,9 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( if (Device) { ZeDevice = Device->ZeDevice; } - std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled, - CounterBasedEventEnabled, UsingImmCmdList, ZeDevice); + std::list *ZePoolCache = getZeEventPoolCache( + HostVisible, ProfilingEnabled, CounterBasedEventEnabled, UsingImmCmdList, + InterruptBasedEventEnabled, ZeDevice); if (!ZePoolCache->empty()) { if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { @@ -572,7 +572,7 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( bool HostVisible, bool WithProfiling, ur_device_handle_t Device, - bool CounterBasedEventEnabled) { + bool CounterBasedEventEnabled, bool InterruptBasedEventEnabled) { std::scoped_lock Lock(EventCacheMutex); auto Cache = getEventCache(HostVisible, WithProfiling, Device, CounterBasedEventEnabled); @@ -585,6 +585,12 @@ ur_event_handle_t 
ur_context_handle_t_::getEventFromContextCache( auto It = Cache->begin(); ur_event_handle_t Event = *It; + if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) { + return nullptr; + } + if (Event->InterruptBasedEventsEnabled != InterruptBasedEventEnabled) { + return nullptr; + } Cache->erase(It); // We have to reset event before using it. Event->reset(); @@ -636,7 +642,8 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { std::list *ZePoolCache = getZeEventPoolCache( Event->isHostVisible(), Event->isProfilingEnabled(), - Event->CounterBasedEventsEnabled, UsingImmediateCommandlists, ZeDevice); + Event->CounterBasedEventsEnabled, UsingImmediateCommandlists, + Event->InterruptBasedEventsEnabled, ZeDevice); // Put the empty pool to the cache of the pools. if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 05e3369274..fd87c8f3a4 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -168,9 +168,9 @@ struct ur_context_handle_t_ : _ur_object { // head. // // Cache of event pools to which host-visible events are added to. - std::vector> ZeEventPoolCache{12}; + std::vector> ZeEventPoolCache{30}; std::vector> - ZeEventPoolCacheDeviceMap{12}; + ZeEventPoolCacheDeviceMap{30}; // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool. @@ -224,7 +224,8 @@ struct ur_context_handle_t_ : _ur_object { ur_event_handle_t getEventFromContextCache(bool HostVisible, bool WithProfiling, ur_device_handle_t Device, - bool CounterBasedEventEnabled); + bool CounterBasedEventEnabled, + bool InterruptBasedEventEnabled); // Add ur_event_handle_t to cache. void addEventToContextCache(ur_event_handle_t); @@ -235,17 +236,29 @@ struct ur_context_handle_t_ : _ur_object { HostVisibleCounterBasedRegularCacheType, HostInvisibleCounterBasedRegularCacheType, HostVisibleCounterBasedImmediateCacheType, - HostInvisibleCounterBasedImmediateCacheType + HostInvisibleCounterBasedImmediateCacheType, + + HostVisibleInterruptBasedRegularCacheType, + HostInvisibleInterruptBasedRegularCacheType, + HostVisibleInterruptBasedImmediateCacheType, + HostInvisibleInterruptBasedImmediateCacheType, + + HostVisibleInterruptAndCounterBasedRegularCacheType, + HostInvisibleInterruptAndCounterBasedRegularCacheType, + HostVisibleInterruptAndCounterBasedImmediateCacheType, + HostInvisibleInterruptAndCounterBasedImmediateCacheType }; std::list * getZeEventPoolCache(bool HostVisible, bool WithProfiling, bool CounterBasedEventEnabled, bool UsingImmediateCmdList, + bool InterruptBasedEventEnabled, ze_device_handle_t ZeDevice) { EventPoolCacheType CacheType; calculateCacheIndex(HostVisible, CounterBasedEventEnabled, - UsingImmediateCmdList, CacheType); + InterruptBasedEventEnabled, UsingImmediateCmdList, + CacheType); if (ZeDevice) { auto ZeEventPoolCacheMap = WithProfiling ? 
&ZeEventPoolCacheDeviceMap[CacheType * 2] @@ -265,23 +278,57 @@ struct ur_context_handle_t_ : _ur_object { ur_result_t calculateCacheIndex(bool HostVisible, bool CounterBasedEventEnabled, bool UsingImmediateCmdList, + bool InterruptBasedEventEnabled, EventPoolCacheType &CacheType) { - if (CounterBasedEventEnabled && HostVisible && !UsingImmediateCmdList) { - CacheType = HostVisibleCounterBasedRegularCacheType; - } else if (CounterBasedEventEnabled && !HostVisible && - !UsingImmediateCmdList) { - CacheType = HostInvisibleCounterBasedRegularCacheType; - } else if (CounterBasedEventEnabled && HostVisible && - UsingImmediateCmdList) { - CacheType = HostVisibleCounterBasedImmediateCacheType; - } else if (CounterBasedEventEnabled && !HostVisible && - UsingImmediateCmdList) { - CacheType = HostInvisibleCounterBasedImmediateCacheType; - } else if (!CounterBasedEventEnabled && HostVisible) { - CacheType = HostVisibleCacheType; + if (InterruptBasedEventEnabled) { + if (CounterBasedEventEnabled) { + if (HostVisible) { + if (UsingImmediateCmdList) { + CacheType = HostVisibleInterruptAndCounterBasedImmediateCacheType; + } else { + CacheType = HostVisibleInterruptAndCounterBasedRegularCacheType; + } + } else { + if (UsingImmediateCmdList) { + CacheType = HostInvisibleInterruptAndCounterBasedImmediateCacheType; + } else { + CacheType = HostInvisibleInterruptAndCounterBasedRegularCacheType; + } + } + } else { + if (HostVisible) { + if (UsingImmediateCmdList) { + CacheType = HostVisibleInterruptBasedImmediateCacheType; + } else { + CacheType = HostVisibleInterruptBasedRegularCacheType; + } + } else { + if (UsingImmediateCmdList) { + CacheType = HostInvisibleInterruptBasedImmediateCacheType; + } else { + CacheType = HostInvisibleInterruptBasedRegularCacheType; + } + } + } } else { - CacheType = HostInvisibleCacheType; + if (CounterBasedEventEnabled && HostVisible && !UsingImmediateCmdList) { + CacheType = HostVisibleCounterBasedRegularCacheType; + } else if (CounterBasedEventEnabled && !HostVisible && + !UsingImmediateCmdList) { + CacheType = HostInvisibleCounterBasedRegularCacheType; + } else if (CounterBasedEventEnabled && HostVisible && + UsingImmediateCmdList) { + CacheType = HostVisibleCounterBasedImmediateCacheType; + } else if (CounterBasedEventEnabled && !HostVisible && + UsingImmediateCmdList) { + CacheType = HostInvisibleCounterBasedImmediateCacheType; + } else if (!CounterBasedEventEnabled && HostVisible) { + CacheType = HostVisibleCacheType; + } else { + CacheType = HostInvisibleCacheType; + } } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index dd0dafc8aa..7cf715a979 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -486,7 +486,7 @@ ur_result_t urDeviceGetInfo( // TODO: To find out correct value return ReturnValue(""); case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: - return ReturnValue(UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_QUEUE_PROPERTIES: return ReturnValue( ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index b9eb6bacd6..ab8580f833 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -438,7 +438,7 @@ ur_result_t urEnqueueEventsWaitWithBarrier( ur_queue_handle_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *, bool)>(EnqueueEventsWaitWithBarrier)( Queue, 
NumEventsInWaitList, EventWaitList, OutEvent, - Queue->interruptBasedEventsEnabled()); + Queue == nullptr ? false : Queue->InterruptBasedEventsEnabled); } ur_result_t urEnqueueEventsWaitWithBarrierExt( @@ -470,7 +470,7 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *, bool)>(EnqueueEventsWaitWithBarrier)( Queue, NumEventsInWaitList, EventWaitList, OutEvent, - Queue->interruptBasedEventsEnabled()); + Queue ? Queue->InterruptBasedEventsEnabled : false); } } @@ -1342,7 +1342,8 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, } if (auto CachedEvent = Context->getEventFromContextCache( - HostVisible, ProfilingEnabled, Device, CounterBasedEventEnabled)) { + HostVisible, ProfilingEnabled, Device, CounterBasedEventEnabled, + InterruptBasedEventEnabled)) { *RetEvent = CachedEvent; return UR_RESULT_SUCCESS; } @@ -1355,7 +1356,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, if (auto Res = Context->getFreeSlotInExistingOrNewPool( ZeEventPool, Index, HostVisible, ProfilingEnabled, Device, CounterBasedEventEnabled, UsingImmediateCommandlists, - Queue->interruptBasedEventsEnabled())) + InterruptBasedEventEnabled)) return Res; ZeStruct ZeEventDesc; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index cae3d3d989..826f8887dd 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1192,10 +1192,20 @@ ur_queue_handle_t_::ur_queue_handle_t_( } return std::atoi(UrRet) != 0; }(); + static const bool useInterruptBasedEvents = [] { + const char *UrRet = std::getenv("UR_L0_USE_INTERRUPT_BASED_EVENTS"); + if (!UrRet) { + return true; + } + return std::atoi(UrRet) != 0; + }(); this->CounterBasedEventsEnabled = UsingImmCmdLists && isInOrderQueue() && Device->useDriverInOrderLists() && useDriverCounterBasedEvents && Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound; + this->InterruptBasedEventsEnabled = useInterruptBasedEvents && + isLowPowerEvents() && isInOrderQueue() && + Device->useDriverInOrderLists(); } void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { @@ -1494,11 +1504,6 @@ bool ur_queue_handle_t_::doReuseDiscardedEvents() { return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); } -bool ur_queue_handle_t_::interruptBasedEventsEnabled() { - return isInOrderQueue() && Device->useDriverInOrderLists() && - isLowPowerEvents(); -} - ur_result_t ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { @@ -1877,10 +1882,12 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, // visible pool. // \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. -ur_result_t createEventAndAssociateQueue( - ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, - std::optional HostVisible, std::optional InterruptBasedEvents) { +ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, + ur_event_handle_t *Event, + ur_command_t CommandType, + ur_command_list_ptr_t CommandList, + bool IsInternal, bool IsMultiDevice, + std::optional HostVisible) { if (!HostVisible.has_value()) { // Internal/discarded events do not need host-scope visibility. 
@@ -1896,9 +1903,7 @@ ur_result_t createEventAndAssociateQueue( UR_CALL(EventCreate( Queue->Context, Queue, IsMultiDevice, HostVisible.value(), Event, Queue->CounterBasedEventsEnabled, false /*ForceDisableProfiling*/, - InterruptBasedEvents.has_value() - ? InterruptBasedEvents.value() - : Queue->interruptBasedEventsEnabled())); + Queue->InterruptBasedEventsEnabled)); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 786f1bdd51..90e95e2c2e 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -375,6 +375,8 @@ struct ur_queue_handle_t_ : _ur_object { // Keeps track of whether we are using Counter-based Events bool CounterBasedEventsEnabled = false; + bool InterruptBasedEventsEnabled = false; + // Map of all command lists used in this queue. ur_command_list_map_t CommandListMap; @@ -533,8 +535,6 @@ struct ur_queue_handle_t_ : _ur_object { // queue. bool doReuseDiscardedEvents(); - bool interruptBasedEventsEnabled(); - // Append command to provided command list to wait and reset the last event if // it is discarded and create new ur_event_handle_t wrapper using the same // native event and put it to the cache. We call this method after each @@ -713,7 +713,7 @@ struct ur_queue_handle_t_ : _ur_object { ur_result_t createEventAndAssociateQueue( ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, - std::optional HostVisible = std::nullopt, std::optional InterruptBasedEvents = std::nullopt); + std::optional HostVisible = std::nullopt); // This helper function checks to see if an event for a command can be included // at the end of a command list batch. 
This will only be true if the event does From 4d15d9bddb85d627d3f56afc0d0d042508dc13c7 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Mon, 25 Nov 2024 17:06:34 -0800 Subject: [PATCH 063/148] [L0] Interrupt-based event implementation Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index fd87c8f3a4..5047c08161 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -257,7 +257,7 @@ struct ur_context_handle_t_ : _ur_object { EventPoolCacheType CacheType; calculateCacheIndex(HostVisible, CounterBasedEventEnabled, - InterruptBasedEventEnabled, UsingImmediateCmdList, + UsingImmediateCmdList, InterruptBasedEventEnabled, CacheType); if (ZeDevice) { auto ZeEventPoolCacheMap = From 5e18167c45fa5ce3bf57e21b5eb21d823b07350f Mon Sep 17 00:00:00 2001 From: y Date: Tue, 3 Dec 2024 02:36:21 +0000 Subject: [PATCH 064/148] [L0] Rebased off of top of main and addressed comments Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.cpp | 11 ++++++----- source/adapters/level_zero/context.hpp | 11 ++++++++--- source/adapters/level_zero/device.cpp | 2 -- source/adapters/level_zero/event.cpp | 7 ++++--- source/adapters/level_zero/queue.cpp | 12 ++---------- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index a86e8d5308..24dcff34ab 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -574,8 +574,9 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( bool HostVisible, bool WithProfiling, ur_device_handle_t Device, bool CounterBasedEventEnabled, bool InterruptBasedEventEnabled) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling, Device, - CounterBasedEventEnabled); + auto Cache = + getEventCache(HostVisible, WithProfiling, Device, + CounterBasedEventEnabled, InterruptBasedEventEnabled); if (Cache->empty()) { logger::info("Cache empty (Host Visible: {}, Profiling: {}, Counter: {}, " "Device: {})", @@ -611,9 +612,9 @@ void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { Device = Event->UrQueue->Device; } - auto Cache = - getEventCache(Event->isHostVisible(), Event->isProfilingEnabled(), Device, - Event->CounterBasedEventsEnabled); + auto Cache = getEventCache( + Event->isHostVisible(), Event->isProfilingEnabled(), Device, + Event->CounterBasedEventsEnabled, Event->InterruptBasedEventsEnabled); logger::info("Inserting {} event (Host Visible: {}, Profiling: {}, Counter: " "{}, Device: {}) into cache {}", Event, Event->HostVisibleEvent, Event->isProfilingEnabled(), diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 5047c08161..5a0549993a 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -374,9 +374,10 @@ struct ur_context_handle_t_ : _ur_object { EVENT_FLAG_HOST_VISIBLE = UR_BIT(0), EVENT_FLAG_WITH_PROFILING = UR_BIT(1), EVENT_FLAG_COUNTER = UR_BIT(2), - EVENT_FLAG_DEVICE = UR_BIT(3), // if set, subsequent bits are device id + EVENT_FLAG_INTERRUPT = UR_BIT(3), + EVENT_FLAG_DEVICE = UR_BIT(5), // if set, subsequent bits are device id MAX_EVENT_FLAG_BITS = - 4, // this is used as an offset for embedding device id + 6, // this is used as an offset for embedding device 
id }; // Mutex to control operations on event caches. @@ -388,7 +389,8 @@ struct ur_context_handle_t_ : _ur_object { // Get the cache of events for a provided scope and profiling mode. EventCache *getEventCache(bool HostVisible, bool WithProfiling, - ur_device_handle_t Device, bool Counter) { + ur_device_handle_t Device, bool Counter, + bool Interrupt) { size_t index = 0; if (HostVisible) { @@ -400,6 +402,9 @@ struct ur_context_handle_t_ : _ur_object { if (Counter) { index |= EVENT_FLAG_COUNTER; } + if (Interrupt) { + index |= EVENT_FLAG_INTERRUPT; + } if (Device) { index |= EVENT_FLAG_DEVICE | (*Device->Id << MAX_EVENT_FLAG_BITS); } diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 7cf715a979..2ca2ca72f6 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1157,8 +1157,6 @@ ur_result_t urDeviceGetInfo( return ReturnValue(true); case UR_DEVICE_INFO_USM_POOL_SUPPORT: return ReturnValue(true); - case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: - return ReturnValue(false); case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: { #ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME const auto ZeDeviceBlockArrayFlags = diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index ab8580f833..a0a5041847 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -457,8 +457,9 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt( ///< this particular command instance. ) { bool InterruptBased = - EnqueueExtProp && - (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); + EnqueueExtProp + ? (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) + : false; if (InterruptBased) { // Create the event with interrupt-based properties return static_cast(EnqueueEventsWaitWithBarrier)( Queue, NumEventsInWaitList, EventWaitList, OutEvent, - Queue ? 
Queue->InterruptBasedEventsEnabled : false); + Queue->InterruptBasedEventsEnabled || false); } } diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 826f8887dd..71d033c86f 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1192,20 +1192,12 @@ ur_queue_handle_t_::ur_queue_handle_t_( } return std::atoi(UrRet) != 0; }(); - static const bool useInterruptBasedEvents = [] { - const char *UrRet = std::getenv("UR_L0_USE_INTERRUPT_BASED_EVENTS"); - if (!UrRet) { - return true; - } - return std::atoi(UrRet) != 0; - }(); this->CounterBasedEventsEnabled = UsingImmCmdLists && isInOrderQueue() && Device->useDriverInOrderLists() && useDriverCounterBasedEvents && Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound; - this->InterruptBasedEventsEnabled = useInterruptBasedEvents && - isLowPowerEvents() && isInOrderQueue() && - Device->useDriverInOrderLists(); + this->InterruptBasedEventsEnabled = + isLowPowerEvents() && isInOrderQueue() && Device->useDriverInOrderLists(); } void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { From 3a3bc0644ff054c7d2059c6b17b42631f681f289 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Tue, 3 Dec 2024 18:19:51 +0000 Subject: [PATCH 065/148] [L0] moved the implementation of EnqueueEventsWaitWithBarrier to EnqueueEventsWaitWithBarrierExt Signed-off-by: Zhang, Winston --- source/adapters/level_zero/event.cpp | 86 +++++++++------------------- source/adapters/level_zero/event.hpp | 6 -- 2 files changed, 28 insertions(+), 64 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index a0a5041847..be08d19f1a 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -156,7 +156,7 @@ static const bool InOrderBarrierBySignal = [] { return (UrRet ? std::atoi(UrRet) : true); }(); -ur_result_t EnqueueEventsWaitWithBarrier( +ur_result_t urEnqueueEventsWaitWithBarrier( ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t @@ -166,9 +166,33 @@ ur_result_t EnqueueEventsWaitWithBarrier( ///< the numEventsInWaitList must be 0, indicating that ///< all previously enqueued commands must be complete. ur_event_handle_t - *OutEvent, ///< [in,out][optional] return an event object that - ///< identifies this particular command instance. - bool InterruptBasedEventsEnabled) { + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + return ur::level_zero::urEnqueueEventsWaitWithBarrierExt( + Queue, nullptr, NumEventsInWaitList, EventWaitList, OutEvent); +} + +ur_result_t urEnqueueEventsWaitWithBarrierExt( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const ur_exp_enqueue_ext_properties_t + *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + bool InterruptBasedEventsEnabled = + EnqueueExtProp + ? (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) | + Queue->InterruptBasedEventsEnabled + : Queue->InterruptBasedEventsEnabled; // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -421,60 +445,6 @@ ur_result_t EnqueueEventsWaitWithBarrier( return UR_RESULT_SUCCESS; } -ur_result_t urEnqueueEventsWaitWithBarrier( - ur_queue_handle_t Queue, ///< [in] handle of the queue object - uint32_t NumEventsInWaitList, ///< [in] size of the event wait list - const ur_event_handle_t - *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] - ///< pointer to a list of events that must be complete - ///< before this command can be executed. If nullptr, - ///< the numEventsInWaitList must be 0, indicating that - ///< all previously enqueued commands must be complete. - ur_event_handle_t - *OutEvent ///< [in,out][optional] return an event object that identifies - ///< this particular command instance. -) { - return static_cast(EnqueueEventsWaitWithBarrier)( - Queue, NumEventsInWaitList, EventWaitList, OutEvent, - Queue == nullptr ? false : Queue->InterruptBasedEventsEnabled); -} - -ur_result_t urEnqueueEventsWaitWithBarrierExt( - ur_queue_handle_t Queue, ///< [in] handle of the queue object - const ur_exp_enqueue_ext_properties_t - *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue - uint32_t NumEventsInWaitList, ///< [in] size of the event wait list - const ur_event_handle_t - *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] - ///< pointer to a list of events that must be complete - ///< before this command can be executed. If nullptr, - ///< the numEventsInWaitList must be 0, indicating that - ///< all previously enqueued commands must be complete. - ur_event_handle_t - *OutEvent ///< [in,out][optional] return an event object that identifies - ///< this particular command instance. -) { - bool InterruptBased = - EnqueueExtProp - ? 
(EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) - : false; - if (InterruptBased) { - // Create the event with interrupt-based properties - return static_cast(EnqueueEventsWaitWithBarrier)( - Queue, NumEventsInWaitList, EventWaitList, OutEvent, true); - } else { - return static_cast(EnqueueEventsWaitWithBarrier)( - Queue, NumEventsInWaitList, EventWaitList, OutEvent, - Queue->InterruptBasedEventsEnabled || false); - } -} - ur_result_t urEventGetInfo( ur_event_handle_t Event, ///< [in] handle of the event object ur_event_info_t PropName, ///< [in] the name of the event property to query diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index d894b2ef4e..de018e7060 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -279,12 +279,6 @@ template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle); ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, bool SetEventCompleted); -ur_result_t EnqueueEventsWaitWithBarrier(ur_queue_handle_t Queue, - uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventList, - ur_event_handle_t *OutEvent, - bool InterruptBasedEventsEnabled); - // Get value of device scope events env var setting or default setting static const EventsScope DeviceEventsSetting = [] { char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); From 0abb51a987299c510205f9c4803e19a211af5790 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Tue, 3 Dec 2024 19:22:30 +0000 Subject: [PATCH 066/148] [L0] Removed unnecessary if conditions in getEventFromContextCache Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 24dcff34ab..c4b9fc5687 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -579,27 +579,24 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( CounterBasedEventEnabled, InterruptBasedEventEnabled); if (Cache->empty()) { logger::info("Cache empty (Host Visible: {}, Profiling: {}, Counter: {}, " - "Device: {})", - HostVisible, WithProfiling, CounterBasedEventEnabled, Device); + "Interrupt: {}, Device: {})", + HostVisible, WithProfiling, CounterBasedEventEnabled, + InterruptBasedEventEnabled, Device); return nullptr; } auto It = Cache->begin(); ur_event_handle_t Event = *It; - if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) { - return nullptr; - } - if (Event->InterruptBasedEventsEnabled != InterruptBasedEventEnabled) { - return nullptr; - } + Cache->erase(It); // We have to reset event before using it. 
Event->reset();
     logger::info("Using {} event (Host Visible: {}, Profiling: {}, Counter: {}, "
-                 "Device: {}) from cache {}",
+                 "Interrupt: {}, Device: {}) from cache {}",
                  Event, Event->HostVisibleEvent, Event->isProfilingEnabled(),
-                 Event->CounterBasedEventsEnabled, Device, Cache);
+                 Event->CounterBasedEventsEnabled,
+                 Event->InterruptBasedEventsEnabled, Cache);
     return Event;
 }

From 47ab4362ea2ae3449b0b3f8afe47d3ec8434a05a Mon Sep 17 00:00:00 2001
From: "Zhang, Winston" 
Date: Tue, 3 Dec 2024 20:45:45 +0000
Subject: [PATCH 067/148] [L0] changed bitwise or to logical or

Signed-off-by: Zhang, Winston
---
 source/adapters/level_zero/event.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index be08d19f1a..098004195b 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -190,7 +190,7 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt(
 ) {
     bool InterruptBasedEventsEnabled =
         EnqueueExtProp
-            ? (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) |
+            ? (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) ||
                   Queue->InterruptBasedEventsEnabled
             : Queue->InterruptBasedEventsEnabled;
     // Lock automatically releases when this goes out of scope.
     std::scoped_lock lock(Queue->Mutex);

From cd42b01d873da0c67ee4f180e32042068f1958c1 Mon Sep 17 00:00:00 2001
From: "Zhang, Winston" 
Date: Tue, 3 Dec 2024 21:00:35 +0000
Subject: [PATCH 068/148] [L0] changed bitwise or to logical or

Signed-off-by: Zhang, Winston
---
 source/adapters/level_zero/event.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 098004195b..ef186d3e7b 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -189,10 +189,10 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt(
                ///< this particular command instance.
 ) {
     bool InterruptBasedEventsEnabled =
-        EnqueueExtProp ? (EnqueueExtProp->flags &
-                          UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) ||
-                             Queue->InterruptBasedEventsEnabled
-                       : Queue->InterruptBasedEventsEnabled;
+        EnqueueExtProp ? (EnqueueExtProp->flags &
+                          UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS) ||
+                             Queue->InterruptBasedEventsEnabled
+                       : Queue->InterruptBasedEventsEnabled;
     // Lock automatically releases when this goes out of scope.
     std::scoped_lock lock(Queue->Mutex);

From 353d30678b2cb4670c11bd506705e5519143a394 Mon Sep 17 00:00:00 2001
From: "Gainullin, Artur" 
Date: Tue, 3 Dec 2024 12:35:35 -0800
Subject: [PATCH 069/148] [L0] Shorten the dir name for the fetched repo to avoid hitting Windows max limit

---
 cmake/FetchLevelZero.cmake | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmake/FetchLevelZero.cmake b/cmake/FetchLevelZero.cmake
index 3bc745f3d0..6d0e10d10d 100644
--- a/cmake/FetchLevelZero.cmake
+++ b/cmake/FetchLevelZero.cmake
@@ -104,14 +104,14 @@ if (UR_COMPUTE_RUNTIME_TAG STREQUAL "")
   set(UR_COMPUTE_RUNTIME_TAG 24.39.31294.12)
 endif()
 include(FetchContent)
-# Sparse fetch only the dir with level zero headers to avoid pulling in the entire compute-runtime.
-FetchContentSparse_Declare(compute-runtime-level-zero-headers ${UR_COMPUTE_RUNTIME_REPO} "${UR_COMPUTE_RUNTIME_TAG}" "level_zero/include") -FetchContent_GetProperties(compute-runtime-level-zero-headers) -if(NOT compute-runtime-level-zero-headers_POPULATED) - FetchContent_Populate(compute-runtime-level-zero-headers) +# Sparse fetch only the dir with level zero headers for experimental features to avoid pulling in the entire compute-runtime. +FetchContentSparse_Declare(exp-headers ${UR_COMPUTE_RUNTIME_REPO} "${UR_COMPUTE_RUNTIME_TAG}" "level_zero/include") +FetchContent_GetProperties(exp-headers) +if(NOT exp-headers_POPULATED) + FetchContent_Populate(exp-headers) endif() add_library(ComputeRuntimeLevelZero-Headers INTERFACE) -set(COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE "${compute-runtime-level-zero-headers_SOURCE_DIR}/../..") +set(COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE "${exp-headers_SOURCE_DIR}/../..") message(STATUS "Level Zero Adapter: Using Level Zero headers from ${COMPUTE_RUNTIME_LEVEL_ZERO_INCLUDE}") target_include_directories(ComputeRuntimeLevelZero-Headers INTERFACE "$" From 039cf144f535c0b1e125ae230e0f7fdc32659e81 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Tue, 3 Dec 2024 21:22:52 +0000 Subject: [PATCH 070/148] [L0] syntax fixes Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.cpp | 2 +- source/adapters/level_zero/context.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index c4b9fc5687..fd13dc35df 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -596,7 +596,7 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( "Interrupt: {}, Device: {}) from cache {}", Event, Event->HostVisibleEvent, Event->isProfilingEnabled(), Event->CounterBasedEventsEnabled, - Event->InterruptBasedEventsEnabled, Cache); + Event->InterruptBasedEventsEnabled, Device, Cache); return Event; } diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 5a0549993a..43608e8bfc 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -375,9 +375,9 @@ struct ur_context_handle_t_ : _ur_object { EVENT_FLAG_WITH_PROFILING = UR_BIT(1), EVENT_FLAG_COUNTER = UR_BIT(2), EVENT_FLAG_INTERRUPT = UR_BIT(3), - EVENT_FLAG_DEVICE = UR_BIT(5), // if set, subsequent bits are device id + EVENT_FLAG_DEVICE = UR_BIT(4), // if set, subsequent bits are device id MAX_EVENT_FLAG_BITS = - 6, // this is used as an offset for embedding device id + 5, // this is used as an offset for embedding device id }; // Mutex to control operations on event caches. 
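The context.hpp hunk just above (patch 070) finishes the event-cache indexing scheme introduced earlier in this series: each event property occupies one bit of the cache index, and for device-specific caches the device id is shifted into the bits above MAX_EVENT_FLAG_BITS, so EVENT_FLAG_DEVICE has to sit at UR_BIT(4) and the offset at 5 for the flags to pack densely (the earlier UR_BIT(5)/6 values left bit 4 permanently unused). The sketch below is a hypothetical, self-contained model of that packing rather than the adapter's code: the helper name cacheIndex, the raw uint32_t device id, and the main() driver are invented for illustration, and UR_BIT(i) is assumed to expand to (1 << i).

#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in for the EventFlags enum above; only the flag layout
// is taken from the patch, the surrounding scaffolding is invented.
enum EventFlags : std::uint32_t {
  EVENT_FLAG_HOST_VISIBLE = 1u << 0,
  EVENT_FLAG_WITH_PROFILING = 1u << 1,
  EVENT_FLAG_COUNTER = 1u << 2,
  EVENT_FLAG_INTERRUPT = 1u << 3,
  EVENT_FLAG_DEVICE = 1u << 4, // if set, the bits above hold the device id
  MAX_EVENT_FLAG_BITS = 5      // offset for embedding the device id
};

// Pack the event properties (and, if present, a device id) into one cache
// index, mirroring the flag layout used by getEventCache() above.
std::size_t cacheIndex(bool HostVisible, bool WithProfiling, bool Counter,
                       bool Interrupt, const std::uint32_t *DeviceId) {
  std::size_t Index = 0;
  if (HostVisible)
    Index |= EVENT_FLAG_HOST_VISIBLE;
  if (WithProfiling)
    Index |= EVENT_FLAG_WITH_PROFILING;
  if (Counter)
    Index |= EVENT_FLAG_COUNTER;
  if (Interrupt)
    Index |= EVENT_FLAG_INTERRUPT;
  if (DeviceId)
    Index |= EVENT_FLAG_DEVICE | (*DeviceId << MAX_EVENT_FLAG_BITS);
  return Index;
}

int main() {
  const std::uint32_t Dev = 3;
  // A counter- and interrupt-based event cache for device 3:
  // (1 << 2) | (1 << 3) | (1 << 4) | (3 << 5) == 0x7C.
  assert(cacheIndex(false, false, true, true, &Dev) == 0x7C);
  return 0;
}

Keeping one cache per property combination matters because events with different properties are not interchangeable: handing a cached non-interrupt event to a caller that asked for a low-power one would silently drop the property, which is what the runtime checks removed from getEventFromContextCache in patch 066 used to guard against.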
From 21684e684847752948bc4cd3d122bd282ad8ba53 Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Tue, 3 Dec 2024 23:48:27 -0800 Subject: [PATCH 071/148] fix metadata with assert --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 41 +- .../sanitizer/asan/asan_interceptor.cpp | 375 +++++++++--------- .../sanitizer/asan/asan_interceptor.hpp | 12 +- 3 files changed, 211 insertions(+), 217 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 774ce3a61d..dca3f4bb05 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -52,12 +52,6 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, return UR_RESULT_SUCCESS; } -bool isInstrumentedKernel(ur_kernel_handle_t hKernel) { - auto hProgram = GetProgram(hKernel); - auto PI = getAsanInterceptor()->getProgramInfo(hProgram); - return PI->isKernelInstrumented(hKernel); -} - } // namespace /////////////////////////////////////////////////////////////////////////////// @@ -465,12 +459,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( getContext()->logger.debug("==== urEnqueueKernelLaunch"); - if (!isInstrumentedKernel(hKernel)) { - return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); - } - USMLaunchInfo LaunchInfo(GetContext(hKernel), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); @@ -1362,9 +1350,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate( getContext()->logger.debug("==== urKernelCreate"); UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); - if (isInstrumentedKernel(*phKernel)) { - UR_CALL(getAsanInterceptor()->insertKernel(*phKernel)); - } + UR_CALL(getAsanInterceptor()->insertKernel(*phKernel)); return UR_RESULT_SUCCESS; } @@ -1385,9 +1371,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( UR_CALL(pfnRetain(hKernel)); auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - KernelInfo->RefCount++; - } + KernelInfo->RefCount++; return UR_RESULT_SUCCESS; } @@ -1407,10 +1391,8 @@ __urdlllocal ur_result_t urKernelRelease( UR_CALL(pfnRelease(hKernel)); auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - if (--KernelInfo->RefCount == 0) { - UR_CALL(getAsanInterceptor()->eraseKernel(hKernel)); - } + if (--KernelInfo->RefCount == 0) { + UR_CALL(getAsanInterceptor()->eraseKernel(hKernel)); } return UR_RESULT_SUCCESS; @@ -1439,8 +1421,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( std::shared_ptr KernelInfo; if (argSize == sizeof(ur_mem_handle_t) && (MemBuffer = getAsanInterceptor()->getMemBuffer( - *ur_cast(pArgValue))) && - (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) { + *ur_cast(pArgValue)))) { + auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KernelInfo->Mutex); KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); } else { @@ -1470,8 +1452,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( std::shared_ptr MemBuffer; std::shared_ptr KernelInfo; - if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue)) && - (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) { + if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) { + auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KernelInfo->Mutex); KernelInfo->BufferArgs[argIndex] 
= std::move(MemBuffer); } else { @@ -1501,7 +1483,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( "==== urKernelSetArgLocal (argIndex={}, argSize={})", argIndex, argSize); - if (auto KI = getAsanInterceptor()->getKernelInfo(hKernel)) { + { + auto KI = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KI->Mutex); // TODO: get local variable alignment auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( @@ -1538,8 +1521,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( pArgValue); std::shared_ptr KI; - if (getAsanInterceptor()->getOptions().DetectKernelArguments && - (KI = getAsanInterceptor()->getKernelInfo(hKernel))) { + if (getAsanInterceptor()->getOptions().DetectKernelArguments) { + auto KI = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KI->Mutex); KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 271d846990..edfd200167 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -255,9 +255,6 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel should be instrumented"); - - UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -648,7 +645,13 @@ ur_result_t AsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) { if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { return UR_RESULT_SUCCESS; } - m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + + auto hProgram = GetProgram(Kernel); + auto PI = getAsanInterceptor()->getProgramInfo(hProgram); + bool IsInstrumented = PI->isKernelInstrumented(Kernel); + + m_KernelMap.emplace(Kernel, + std::make_shared(Kernel, IsInstrumented)); return UR_RESULT_SUCCESS; } @@ -689,204 +692,210 @@ ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) { + auto KernelInfo = getKernelInfo(Kernel); - do { - auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel should be instrumented"); - - // Validate pointer arguments - if (getOptions().DetectKernelArguments) { - for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { - auto Ptr = PtrPair.first; - if (Ptr == nullptr) { - continue; - } - if (auto ValidateResult = ValidateUSMPointer( - ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) { - ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, - ValidateResult, PtrPair.second); - exitWithErrors(); - } + auto ArgNums = GetKernelNumArgs(Kernel); + auto LocalMemoryUsage = + GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); + auto PrivateMemoryUsage = + GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); + + getContext()->logger.info( + "KernelInfo {} (Name={}, ArgNums={}, IsInstrumented={}, " + "LocalMemory={}, PrivateMemory={})", + (void *)Kernel, GetKernelName(Kernel), ArgNums, + KernelInfo->IsInstrumented, LocalMemoryUsage, PrivateMemoryUsage); + + // Validate pointer arguments + if (getOptions().DetectKernelArguments) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + auto Ptr = PtrPair.first; + if (Ptr == nullptr) { + continue; } - 
} - - // Set membuffer arguments - for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { - char *ArgPointer = nullptr; - UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgIndex, nullptr, ArgPointer); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error( - "Failed to set buffer {} as the {} arg to kernel {}: {}", - ur_cast(MemBuffer.get()), ArgIndex, Kernel, - URes); + if (auto ValidateResult = ValidateUSMPointer( + ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) { + ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, + ValidateResult, PtrPair.second); + exitWithErrors(); } } + } - // Set launch info argument - auto ArgNums = GetKernelNumArgs(Kernel); - if (ArgNums) { - getContext()->logger.debug( - "launch_info {} (numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, - (void *)LaunchInfo.Data->LocalArgs); - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("Failed to set launch info: {}", - URes); - return URes; - } + // Set membuffer arguments + for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { + char *ArgPointer = nullptr; + UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgIndex, nullptr, ArgPointer); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to set buffer {} as the {} arg to kernel {}: {}", + ur_cast(MemBuffer.get()), ArgIndex, Kernel, + URes); + return URes; } + } - LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; - LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; - LaunchInfo.Data->DeviceTy = DeviceInfo->Type; - LaunchInfo.Data->Debug = getOptions().Debug ? 
1 : 0; - - if (LaunchInfo.LocalWorkSize.empty()) { - LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = - getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, - LaunchInfo.GlobalWorkOffset, LaunchInfo.GlobalWorkSize, - LaunchInfo.LocalWorkSize.data()); - if (URes != UR_RESULT_SUCCESS) { - if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { - return URes; - } - // If urKernelGetSuggestedLocalWorkSize is not supported by driver, we fallback - // to inefficient implementation - for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { - LaunchInfo.LocalWorkSize[Dim] = 1; - } - } - } + if (!KernelInfo->IsInstrumented) { + return UR_RESULT_SUCCESS; + } - const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data(); - uint32_t NumWG = 1; - for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { - NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) / - LocalWorkSize[Dim]; + // Set launch info argument + { + assert(ArgNums >= 1 && + "Sanitized Kernel should have at least one argument"); + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to set launch info: {}", URes); + return URes; } + } - auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, - Device = DeviceInfo->Handle, - Queue](size_t Size, uptr &Ptr) { - void *Allocated = nullptr; - auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc( - Context, Device, nullptr, nullptr, Size, &Allocated); - if (URes != UR_RESULT_SUCCESS) { + UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); + + LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; + LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; + LaunchInfo.Data->DeviceTy = DeviceInfo->Type; + LaunchInfo.Data->Debug = getOptions().Debug ? 
1 : 0; + + getContext()->logger.info( + "LaunchInfo {} (device={}, debug={}, numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data, ToString(LaunchInfo.Data->DeviceTy), + LaunchInfo.Data->Debug, LaunchInfo.Data->NumLocalArgs, + (void *)LaunchInfo.Data->LocalArgs); + + // urKernelGetSuggestedLocalWorkSize must be called after urKernelSetArgPointer + if (LaunchInfo.LocalWorkSize.empty()) { + LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( + Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset, + LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + if (URes != UR_RESULT_SUCCESS) { + if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; } - // Initialize shadow memory - URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size); - if (URes != UR_RESULT_SUCCESS) { - [[maybe_unused]] auto URes = - getContext()->urDdiTable.USM.pfnFree(Context, Allocated); - assert(URes == UR_RESULT_SUCCESS && - "urUSMFree failed at allocating shadow memory"); - Allocated = nullptr; + // If urKernelGetSuggestedLocalWorkSize is not supported by driver, we fallback + // to inefficient implementation + for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { + LaunchInfo.LocalWorkSize[Dim] = 1; } - Ptr = (uptr)Allocated; + } + } + + const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data(); + uint32_t NumWG = 1; + for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { + NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) / + LocalWorkSize[Dim]; + } + + auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, + Device = DeviceInfo->Handle, + Queue](size_t Size, uptr &Ptr) { + void *Allocated = nullptr; + auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, Size, &Allocated); + if (URes != UR_RESULT_SUCCESS) { return URes; - }; - - auto LocalMemoryUsage = - GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); - auto PrivateMemoryUsage = - GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); - - getContext()->logger.info( - "KernelInfo {} (LocalMemory={}, PrivateMemory={})", (void *)Kernel, - LocalMemoryUsage, PrivateMemoryUsage); - - // Write shadow memory offset for local memory - if (getOptions().DetectLocals) { - // CPU needn't this - if (DeviceInfo->Type == DeviceType::GPU_PVC || - DeviceInfo->Type == DeviceType::GPU_DG2) { - const size_t LocalMemorySize = - GetDeviceLocalMemorySize(DeviceInfo->Handle); - const size_t LocalShadowMemorySize = - (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE; - - getContext()->logger.debug( - "LocalMemory(WorkGroup={}, LocalMemorySize={}, " - "LocalShadowMemorySize={})", - NumWG, LocalMemorySize, LocalShadowMemorySize); - - if (EnqueueAllocateShadowMemory( - LocalShadowMemorySize, - LaunchInfo.Data->LocalShadowOffset) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for local " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking local memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - LaunchInfo.Data->LocalShadowOffsetEnd = - LaunchInfo.Data->LocalShadowOffset + - LocalShadowMemorySize - 1; - - ContextInfo->Stats.UpdateShadowMalloced( - LocalShadowMemorySize); - - getContext()->logger.info( - "ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.Data->LocalShadowOffset, - (void *)LaunchInfo.Data->LocalShadowOffsetEnd); - } + } + // Initialize shadow memory + URes = 
EnqueueUSMBlockingSet(Queue, Allocated, 0, Size); + if (URes != UR_RESULT_SUCCESS) { + [[maybe_unused]] auto URes = + getContext()->urDdiTable.USM.pfnFree(Context, Allocated); + assert(URes == UR_RESULT_SUCCESS && + "urUSMFree failed at allocating shadow memory"); + Allocated = nullptr; + } + Ptr = (uptr)Allocated; + return URes; + }; + + // Write shadow memory offset for local memory + if (getOptions().DetectLocals) { + // CPU needn't this + if (DeviceInfo->Type == DeviceType::GPU_PVC || + DeviceInfo->Type == DeviceType::GPU_DG2) { + const size_t LocalMemorySize = + GetDeviceLocalMemorySize(DeviceInfo->Handle); + const size_t LocalShadowMemorySize = + (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE; + + getContext()->logger.debug( + "LocalMemory(WorkGroup={}, LocalMemorySize={}, " + "LocalShadowMemorySize={})", + NumWG, LocalMemorySize, LocalShadowMemorySize); + + if (EnqueueAllocateShadowMemory( + LocalShadowMemorySize, + LaunchInfo.Data->LocalShadowOffset) != UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for local " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + getContext()->logger.warning( + "Skip checking local memory of kernel <{}>", + GetKernelName(Kernel)); + } else { + LaunchInfo.Data->LocalShadowOffsetEnd = + LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - + 1; + + ContextInfo->Stats.UpdateShadowMalloced(LocalShadowMemorySize); + + getContext()->logger.info( + "ShadowMemory(Local, {} - {})", + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } + } - // Write shadow memory offset for private memory - if (getOptions().DetectPrivates) { - if (DeviceInfo->Type == DeviceType::CPU) { - LaunchInfo.Data->PrivateShadowOffset = - DeviceInfo->Shadow->ShadowBegin; - } else if (DeviceInfo->Type == DeviceType::GPU_PVC || - DeviceInfo->Type == DeviceType::GPU_DG2) { - const size_t PrivateShadowMemorySize = - (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE; - - getContext()->logger.debug("PrivateMemory(WorkGroup={}, " - "PrivateShadowMemorySize={})", - NumWG, PrivateShadowMemorySize); - - if (EnqueueAllocateShadowMemory( - PrivateShadowMemorySize, - LaunchInfo.Data->PrivateShadowOffset) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for private " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking private memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - LaunchInfo.Data->PrivateShadowOffsetEnd = - LaunchInfo.Data->PrivateShadowOffset + - PrivateShadowMemorySize - 1; - - ContextInfo->Stats.UpdateShadowMalloced( - PrivateShadowMemorySize); - - getContext()->logger.info( - "ShadowMemory(Private, {} - {})", - (void *)LaunchInfo.Data->PrivateShadowOffset, - (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); - } + // Write shadow memory offset for private memory + if (getOptions().DetectPrivates) { + if (DeviceInfo->Type == DeviceType::CPU) { + LaunchInfo.Data->PrivateShadowOffset = + DeviceInfo->Shadow->ShadowBegin; + } else if (DeviceInfo->Type == DeviceType::GPU_PVC || + DeviceInfo->Type == DeviceType::GPU_DG2) { + const size_t PrivateShadowMemorySize = + (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE; + + getContext()->logger.debug("PrivateMemory(WorkGroup={}, " + "PrivateShadowMemorySize={})", + NumWG, PrivateShadowMemorySize); + + if (EnqueueAllocateShadowMemory( + PrivateShadowMemorySize, + LaunchInfo.Data->PrivateShadowOffset) != 
+ UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for private " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + getContext()->logger.warning( + "Skip checking private memory of kernel <{}>", + GetKernelName(Kernel)); + } else { + LaunchInfo.Data->PrivateShadowOffsetEnd = + LaunchInfo.Data->PrivateShadowOffset + + PrivateShadowMemorySize - 1; + + ContextInfo->Stats.UpdateShadowMalloced( + PrivateShadowMemorySize); + + getContext()->logger.info( + "ShadowMemory(Private, {} - {})", + (void *)LaunchInfo.Data->PrivateShadowOffset, + (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); } } - } while (false); + } return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 926be1388e..d24be1e1f2 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -85,6 +85,9 @@ struct KernelInfo { ur_kernel_handle_t Handle; std::atomic RefCount = 1; + // sanitized kernel + bool IsInstrumented = false; + // lock this mutex if following fields are accessed ur_shared_mutex Mutex; std::unordered_map> BufferArgs; @@ -94,7 +97,8 @@ struct KernelInfo { // Need preserve the order of local arguments std::map LocalArgs; - explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + explicit KernelInfo(ur_kernel_handle_t Kernel, bool IsInstrumented) + : Handle(Kernel), IsInstrumented(IsInstrumented) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Kernel.pfnRetain(Kernel); assert(Result == UR_RESULT_SUCCESS); @@ -272,10 +276,8 @@ class AsanInterceptor { std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { std::shared_lock Guard(m_KernelMapMutex); - if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { - return m_KernelMap[Kernel]; - } - return nullptr; + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + return m_KernelMap[Kernel]; } const AsanOptions &getOptions() { return m_Options; } From 72b5730061b86024078e9440c7e645e7a3904c24 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 4 Dec 2024 10:48:43 +0000 Subject: [PATCH 072/148] Add tensormap stubs to L0 v2 --- source/adapters/level_zero/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 4e81bbd738..cb7e0281af 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -140,6 +140,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp # v2-only sources ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp From cd8b01c888873d6630a972a1e186c331d4310a1d Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 4 Dec 2024 07:48:46 -0800 Subject: [PATCH 073/148] [L0] Disabling Driver In Order Lists by default - Due to L0 Driver issues related to performance using driver in order lists, the feature is being disabled for the time being. - Once issues with the L0 Drivers implementing this feature are resolved, then this feature will be re-enabled. Signed-off-by: Neil R. 
Spruit --- source/adapters/level_zero/device.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 99bb20d31a..9bc15671bd 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1518,10 +1518,10 @@ bool ur_device_handle_t_::useDriverInOrderLists() { static const bool UseDriverInOrderLists = [&] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); - bool CompatibleDriver = this->Platform->isDriverVersionNewerOrSimilar( - 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + // bool CompatibleDriver = this->Platform->isDriverVersionNewerOrSimilar( + // 1, 3, L0_DRIVER_INORDER_MIN_VERSION); if (!UrRet) - return CompatibleDriver; + return false; return std::atoi(UrRet) != 0; }(); From b371894ffa53a7935db33b8a667dab011bfe0fed Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Wed, 4 Dec 2024 17:38:08 +0000 Subject: [PATCH 074/148] Add rocm path to find_package When installed via deb, rocm is installed into `/opt/rocm` and wasn't included in $PATH. Include this directory as a possible location to find `ROCM_AGENT_ENUMERATOR`. --- cmake/FindRocmAgentEnumerator.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FindRocmAgentEnumerator.cmake b/cmake/FindRocmAgentEnumerator.cmake index 6c32410265..b966d8fd9d 100644 --- a/cmake/FindRocmAgentEnumerator.cmake +++ b/cmake/FindRocmAgentEnumerator.cmake @@ -9,7 +9,7 @@ # rocm_agent_enumerator is found. # -find_program(ROCM_AGENT_ENUMERATOR NAMES rocm_agent_enumerator) +find_program(ROCM_AGENT_ENUMERATOR NAMES rocm_agent_enumerator PATHS /opt/rocm/bin) if(ROCM_AGENT_ENUMERATOR) set(ROCM_AGENT_ENUMERATOR_FOUND TRUE) From 1ee677464cc05bb022b1ca4d78737bf58e704978 Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 4 Dec 2024 09:32:58 -0800 Subject: [PATCH 075/148] [L0] Disable driver in order in command buffer Signed-off-by: Neil R. Spruit --- source/adapters/level_zero/command_buffer.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index eccdc5e4d2..bfe00baa60 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -598,12 +598,16 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, */ bool canBeInOrder(ur_context_handle_t Context, const ur_exp_command_buffer_desc_t *CommandBufferDesc) { + std::ignore = Context; + std::ignore = CommandBufferDesc; // In-order command-lists are not available in old driver version. - bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( - 1, 3, L0_DRIVER_INORDER_MIN_VERSION); - return CompatibleDriver - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; + // bool CompatibleDriver = + // Context->getPlatform()->isDriverVersionNewerOrSimilar( + // 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + // return CompatibleDriver + // ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) + // : false; + return false; } /** From 447b638bcdaf8e747cad48095481bf0ff07c8512 Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 4 Dec 2024 10:13:51 -0800 Subject: [PATCH 076/148] [L0] Enable Command Buffer usage of UR_L0_USE_DRIVER_INORDER_LISTS Signed-off-by: Neil R. 
Spruit --- source/adapters/level_zero/command_buffer.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index bfe00baa60..4a8688628f 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -598,16 +598,15 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, */ bool canBeInOrder(ur_context_handle_t Context, const ur_exp_command_buffer_desc_t *CommandBufferDesc) { - std::ignore = Context; - std::ignore = CommandBufferDesc; + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); // In-order command-lists are not available in old driver version. - // bool CompatibleDriver = - // Context->getPlatform()->isDriverVersionNewerOrSimilar( - // 1, 3, L0_DRIVER_INORDER_MIN_VERSION); - // return CompatibleDriver - // ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - // : false; - return false; + bool DriverInOrderRequested = UrRet ? std::atoi(UrRet) != 0 : false; + bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CanUseDriverInOrderLists = CompatibleDriver && DriverInOrderRequested; + return CanUseDriverInOrderLists + ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) + : false; } /** From 04482111acc8c091e7bb347736899df491e5baf8 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 5 Dec 2024 11:34:55 +0000 Subject: [PATCH 077/148] Disable cfi sanitizer This has caused a number of problems, so is disabled by default now. It can be re-enabled using the cmake var `UR_USE_CFI`. Issues are tracked internally by Intel as CMPLRLLVM-63862. --- CMakeLists.txt | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bcbf0d7988..c3330ce31b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ option(UR_USE_ASAN "enable AddressSanitizer" OFF) option(UR_USE_UBSAN "enable UndefinedBehaviorSanitizer" OFF) option(UR_USE_MSAN "enable MemorySanitizer" OFF) option(UR_USE_TSAN "enable ThreadSanitizer" OFF) -option(UR_USE_CFI "enable Control Flow Integrity checks (requires clang and implies -flto)" ON) +option(UR_USE_CFI "enable Control Flow Integrity checks (requires clang and implies -flto)" OFF) option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(UR_ENABLE_SANITIZER "enable device sanitizer" ON) option(UR_ENABLE_SYMBOLIZER "enable symoblizer for sanitizer" OFF) diff --git a/README.md b/README.md index 9f4eeef5ae..459b75398a 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ List of options provided by CMake: | UR_USE_TSAN | Enable ThreadSanitizer | ON/OFF | OFF | | UR_USE_UBSAN | Enable UndefinedBehavior Sanitizer | ON/OFF | OFF | | UR_USE_MSAN | Enable MemorySanitizer (clang only) | ON/OFF | OFF | -| UR_USE_CFI | Enable Control Flow Integrity checks (clang only, also enables lto) | ON/OFF | ON | +| UR_USE_CFI | Enable Control Flow Integrity checks (clang only, also enables lto) | ON/OFF | OFF | | UR_ENABLE_TRACING | Enable XPTI-based tracing layer | ON/OFF | OFF | | UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON | | UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 | From 323b37c555042ac1b887b9a08a1281bc207ee0b7 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 25 Nov 2024 17:08:07 +0000 Subject: [PATCH 078/148] Fix 
command_buffer coverity issues - Change command handle constructors to accept const ref for vector types - Add std::move to certain vector assignments - Add missing call to store command handle in command buffer in urCommandBufferAppendMemBufferCopyExp --- source/adapters/cuda/command_buffer.cpp | 29 +++++++++++++------------ source/adapters/cuda/command_buffer.hpp | 26 +++++++++++------------ 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 4b4b2cffe5..b60d2944b1 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -105,7 +105,7 @@ ur_result_t ur_exp_command_buffer_handle_t_::addWaitNodes( } // Set DepsLists as an output parameter for communicating the list of wait // nodes created. - DepsList = WaitNodes; + DepsList = std::move(WaitNodes); return UR_RESULT_SUCCESS; } @@ -115,7 +115,7 @@ kernel_command_handle::kernel_command_handle( const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes), Kernel(Kernel), Params(Params), WorkDim(WorkDim) { @@ -146,7 +146,7 @@ ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, - CUgraphNode SignalNode, std::vector<CUgraphNode> WaitNodes) + CUgraphNode SignalNode, const std::vector<CUgraphNode> &WaitNodes) : CommandBuffer(CommandBuffer), Node(Node), SignalNode(SignalNode), WaitNodes(WaitNodes), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); @@ -339,7 +339,7 @@ static ur_result_t enqueueCommandBufferFillHelper( } std::vector<CUgraphNode> WaitNodes = - NumEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + NumEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes); CommandBuffer->CommandHandles.push_back(NewCommand); @@ -537,7 +537,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new kernel_command_handle( hCommandBuffer, hKernel, GraphNode, NodeParams, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, @@ -595,7 +595,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new usm_memcpy_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -663,9 +663,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ?
std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_copy_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); if (phCommand) { NewCommand->incrementInternalReferenceCount(); @@ -727,7 +728,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_copy_rect_command_handle( hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -788,7 +789,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_write_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -848,7 +849,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_read_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -913,7 +914,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_write_rect_command_handle( hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -978,7 +979,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new buffer_read_rect_command_handle( hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -1034,7 +1035,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new usm_prefetch_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -1090,7 +1091,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( } std::vector<CUgraphNode> WaitNodes = - numEventsInWaitList ? DepsList : std::vector<CUgraphNode>(); + numEventsInWaitList ?
std::move(DepsList) : std::vector<CUgraphNode>(); auto NewCommand = new usm_advise_command_handle(hCommandBuffer, GraphNode, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index c82409104f..d2403a4ab3 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -56,7 +56,7 @@ enum class CommandType { struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, - CUgraphNode SignalNode, std::vector<CUgraphNode> WaitNodes); + CUgraphNode SignalNode, const std::vector<CUgraphNode> &WaitNodes); virtual ~ur_exp_command_buffer_command_handle_t_() {} @@ -102,7 +102,7 @@ struct kernel_command_handle : ur_exp_command_buffer_command_handle_t_ { const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes); + const std::vector<CUgraphNode> &WaitNodes); CommandType getCommandType() const noexcept override { return CommandType::Kernel; @@ -161,7 +161,7 @@ struct kernel_command_handle : ur_exp_command_buffer_command_handle_t_ { struct usm_memcpy_command_handle : ur_exp_command_buffer_command_handle_t_ { usm_memcpy_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -172,7 +172,7 @@ struct usm_memcpy_command_handle : ur_exp_command_buffer_command_handle_t_ { struct usm_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { usm_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -183,7 +183,7 @@ struct usm_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { struct buffer_copy_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_copy_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -195,7 +195,7 @@ struct buffer_copy_rect_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_copy_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -206,7 +206,7 @@ struct buffer_copy_rect_command_handle struct buffer_read_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_read_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -218,7 +218,7
@@ struct buffer_read_rect_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_read_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -229,7 +229,7 @@ struct buffer_read_rect_command_handle struct buffer_write_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_write_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -241,7 +241,7 @@ struct buffer_write_rect_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_write_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -252,7 +252,7 @@ struct buffer_write_rect_command_handle struct buffer_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { buffer_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -263,7 +263,7 @@ struct buffer_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { struct usm_prefetch_command_handle : ur_exp_command_buffer_command_handle_t_ { usm_prefetch_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { @@ -274,7 +274,7 @@ struct usm_prefetch_command_handle : ur_exp_command_buffer_command_handle_t_ { struct usm_advise_command_handle : ur_exp_command_buffer_command_handle_t_ { usm_advise_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, CUgraphNode SignalNode, - std::vector<CUgraphNode> WaitNodes) + const std::vector<CUgraphNode> &WaitNodes) : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, WaitNodes) {} CommandType getCommandType() const noexcept override { From ea214abc01634af765272542d7ca30b3967e4879 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 5 Dec 2024 13:48:24 +0100 Subject: [PATCH 079/148] [benchmarks] add stddev calculation and outlier elimination We now run benchmarks until the result stabilizes below a threshold stddev value, or until we reach the maximum number of iterations. Outlier results are eliminated each time we calculate stddev; this helps the stddev stabilize more quickly and minimizes the number of repeat runs.
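For reference, the outlier elimination relies on the modified z-score, which uses the median and the median absolute deviation (MAD) rather than the mean, so it stays robust against the very outliers it is meant to reject. A minimal standalone sketch of the filtering step, mirroring the logic added in this patch (the sample values are made up):

    import statistics

    def modified_z_score(values: list[float]) -> list[float]:
        median = statistics.median(values)
        mad = statistics.median([abs(v - median) for v in values])
        if mad == 0:
            return [0.0] * len(values)
        return [0.6745 * (v - median) / mad for v in values]

    samples = [10.1, 10.3, 9.9, 10.2, 42.0]  # 42.0 is an obvious outlier
    kept = [v for v, z in zip(samples, modified_z_score(samples)) if abs(z) <= 3.5]
    print(kept)  # [10.1, 10.3, 9.9, 10.2]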
--- scripts/benchmarks/benches/base.py | 3 - scripts/benchmarks/benches/llamacpp.py | 3 - scripts/benchmarks/benches/options.py | 5 +- scripts/benchmarks/benches/result.py | 1 + scripts/benchmarks/main.py | 131 +++++++++++++++++++------ 5 files changed, 104 insertions(+), 39 deletions(-) diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 4356fb0d96..31f2054d9a 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -71,9 +71,6 @@ def run(self, env_vars) -> list[Result]: def teardown(self): raise NotImplementedError() - def ignore_iterations(self): - return False - class Suite: def benchmarks(self) -> list[Benchmark]: raise NotImplementedError() diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py index 4a260a09cc..2dbdb5cbcf 100644 --- a/scripts/benchmarks/benches/llamacpp.py +++ b/scripts/benchmarks/benches/llamacpp.py @@ -76,9 +76,6 @@ def name(self): def lower_is_better(self): return False - def ignore_iterations(self): - return True - def run(self, env_vars) -> list[Result]: command = [ f"{self.benchmark_bin}", diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index 03b0db7128..fa5d52ca8c 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -15,13 +15,16 @@ class Options: rebuild: bool = True benchmark_cwd: str = "INVALID" timeout: float = 600 - iterations: int = 5 + iterations: int = 3 verbose: bool = False compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results output_html: bool = False output_markdown: bool = True dry_run: bool = False + # these two should probably be merged into one setting + stddev_threshold: float = 0.02 + epsilon: float = 0.02 options = Options() diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 7d40040607..336039c342 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -23,6 +23,7 @@ class Result: lower_is_better: bool = True git_hash: str = '' date: Optional[datetime] = None + stddev: float = 0.0 @dataclass_json @dataclass diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index bca0f01553..6a2f8a8273 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -18,10 +18,97 @@ import argparse import re +import statistics # Update this if you are changing the layout of the results files INTERNAL_WORKDIR_VERSION = '2.0' +def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]]): + for iter in range(iters): + print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True) + bench_results = benchmark.run(env_vars) + if bench_results is None: + print(f"did not finish (OK for sycl-bench).") + break + + for bench_result in bench_results: + # TODO: report failures in markdown/html ? 
+ if not bench_result.passed: + print(f"complete ({bench_result.label}: verification FAILED)") + continue + + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).") + + bench_result.name = bench_result.label + bench_result.lower_is_better = benchmark.lower_is_better() + + if bench_result.label not in results: + results[bench_result.label] = [] + + results[bench_result.label].append(bench_result) + +# https://www.statology.org/modified-z-score/ +def modified_z_score(values: list[float]) -> list[float]: + median = statistics.median(values) + mad = statistics.median([abs(v - median) for v in values]) + if mad == 0: + return [0] * len(values) + return [(0.6745 * (v - median)) / mad for v in values] + +def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> dict[str, list[Result]]: + new_results = {} + for key, rlist in results.items(): + # don't eliminate outliers on first pass + if len(rlist) <= options.iterations: + new_results[key] = rlist + continue + + values = [r.value for r in rlist] + z_scores = modified_z_score(values) + filtered_rlist = [r for r, z in zip(rlist, z_scores) if abs(z) <= threshold] + + if not filtered_rlist: + new_results[key] = rlist + else: + new_results[key] = filtered_rlist + + return new_results + +def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]: + processed: list[Result] = [] + # technically, we can detect whether result is below or above threshold per + # individual result. However, we can't repeat benchmark runs with that + # granularity. So we just reject all results and try again. + valid_results = True # above stddev threshold + + for label, rlist in remove_outliers(results).items(): + if (len(rlist) == 0): + continue + + if len(rlist) == 1: + processed.append(rlist[0]) + continue + + values = [r.value for r in rlist] + + mean_value = statistics.mean(values) + stddev = statistics.stdev(values) + + threshold = options.stddev_threshold * mean_value + + if stddev > threshold: + print(f"stddev {stddev} above the threshold {threshold} for {label}") + valid_results = False + + rlist.sort(key=lambda res: res.value) + median_index = len(rlist) // 2 + median_result = rlist[median_index] + median_result.stddev = stddev + + processed.append(median_result) + + return valid_results, processed + def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) @@ -65,36 +152,15 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): for benchmark in benchmarks: try: merged_env_vars = {**additional_env_vars} - iteration_results = [] - iterations = options.iterations if not benchmark.ignore_iterations() else 1 - for iter in range(iterations): - print(f"running {benchmark.name()}, iteration {iter}... 
", end='', flush=True) - bench_results = benchmark.run(merged_env_vars) - if bench_results is not None: - for bench_result in bench_results: - if bench_result.passed: - print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).") - else: - print(f"complete ({bench_result.label}: verification FAILED)") - iteration_results.append(bench_result) - else: - print(f"did not finish (OK for sycl-bench).") + intermediate_results: dict[str, list[Result]] = {} + processed: list[Result] = [] + for _ in range(5): + run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results) + valid, processed = process_results(intermediate_results) + if valid: break + results += processed - if len(iteration_results) == 0: - continue - - for label in set([result.label for result in iteration_results]): - label_results = [result for result in iteration_results if result.label == label and result.passed == True] - if len(label_results) > 0: - label_results.sort(key=lambda res: res.value) - median_index = len(label_results) // 2 - median_result = label_results[median_index] - - median_result.name = label - median_result.lower_is_better = benchmark.lower_is_better() - - results.append(median_result) except Exception as e: if options.exit_on_failure: raise e @@ -164,14 +230,15 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) - parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5) - parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600) + parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations) + parser.add_argument("--stddev-threshold", type=float, help='If stddev % is above this threshold, rerun all iterations', default=options.stddev_threshold) + parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout) parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None) - parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005) + parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=options.epsilon) parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true") parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value) - parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10) + parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max) parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False) parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", 
default=True) parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False) From 8a23597bba1ad66717317b0b5dd45a642670c817 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 5 Dec 2024 15:09:42 +0100 Subject: [PATCH 080/148] improve html output This removes the bar charts, since they were complicated to create and not really useful. The timeline charts are also now drawn individually and are filterable. --- scripts/benchmarks/output_html.py | 265 +++++------------------------- 1 file changed, 44 insertions(+), 221 deletions(-) diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 4a04252797..6233ff900a 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -9,7 +9,6 @@ from collections import defaultdict from dataclasses import dataclass import matplotlib.dates as mdates -import numpy as np from benches.result import BenchmarkRun, Result @dataclass @@ -24,220 +23,21 @@ class BenchmarkSeries: runs: list[BenchmarkRun] @dataclass -class LatestResults: - benchmark_label: str - run_values: dict[str, float] - - @classmethod - def from_dict(cls, label: str, values: dict[str, float]) -> 'LatestResults': - return cls(benchmark_label=label, run_values=values) - -def get_latest_results(benchmarks: list[BenchmarkSeries]) -> dict[str, LatestResults]: - latest_results: dict[str, LatestResults] = {} - for benchmark in benchmarks: - run_values = { - run.name: max(run.results, key=lambda x: x.date).value - for run in benchmark.runs - } - latest_results[benchmark.label] = LatestResults.from_dict(benchmark.label, run_values) - return latest_results - -def prepare_normalized_data(latest_results: dict[str, LatestResults], - benchmarks: list[BenchmarkSeries], - group_benchmarks: list[str], - non_baseline_runs: list[str], - baseline_name: str) -> list[list[float]]: - normalized_data = [] - benchmark_map = {b.label: b for b in benchmarks} - - for run_name in non_baseline_runs: - run_data: list[float] = [] - for benchmark_label in group_benchmarks: - benchmark_data = latest_results[benchmark_label].run_values - if run_name not in benchmark_data or baseline_name not in benchmark_data: - run_data.append(None) - continue - - baseline_value = benchmark_data[baseline_name] - current_value = benchmark_data[run_name] - - normalized_value = ((baseline_value / current_value) if benchmark_map[benchmark_label].metadata.lower_is_better - else (current_value / baseline_value)) * 100 - run_data.append(normalized_value) - normalized_data.append(run_data) - return normalized_data - -def format_benchmark_label(label: str) -> list[str]: - words = re.split(' |_', label) - lines = [] - current_line = [] - - # max line length 30 - for word in words: - if len(' '.join(current_line + [word])) > 30: - lines.append(' '.join(current_line)) - current_line = [word] - else: - current_line.append(word) - - if current_line: - lines.append(' '.join(current_line)) - - return lines - -def create_bar_plot(ax: plt.Axes, - normalized_data: list[list[float]], - group_benchmarks: list[str], - non_baseline_runs: list[str], - latest_results: dict[str, LatestResults], - benchmarks: list[BenchmarkSeries], - baseline_name: str) -> float: - x = np.arange(len(group_benchmarks)) - width = 0.8 / len(non_baseline_runs) - max_height = 0 - benchmark_map = {b.label: b for b in benchmarks} - - for i, (run_name, run_data) in enumerate(zip(non_baseline_runs, normalized_data)): - offset = width * i - width * (len(non_baseline_runs) - 1) / 2 - 
positions = x + offset - valid_data = [v if v is not None else 0 for v in run_data] - rects = ax.bar(positions, valid_data, width, label=run_name) - - for rect, value, benchmark_label in zip(rects, run_data, group_benchmarks): - if value is not None: - height = rect.get_height() - if height > max_height: - max_height = height - - ax.text(rect.get_x() + rect.get_width()/2., height + 2, - f'{value:.1f}%', - ha='center', va='bottom') - - benchmark_data = latest_results[benchmark_label].run_values - baseline_value = benchmark_data[baseline_name] - current_value = benchmark_data[run_name] - unit = benchmark_map[benchmark_label].metadata.unit - - tooltip_labels = [ - f"Run: {run_name}\n" - f"Value: {current_value:.2f} {unit}\n" - f"Normalized to ({baseline_name}): {baseline_value:.2f} {unit}\n" - f"Normalized: {value:.1f}%" - ] - tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}') - mpld3.plugins.connect(ax.figure, tooltip) - - return max_height - -def add_chart_elements(ax: plt.Axes, - group_benchmarks: list[str], - group_name: str, - max_height: float) -> None: - top_padding = max_height * 0.2 - ax.set_ylim(0, max_height + top_padding) - ax.set_ylabel('Performance relative to baseline (%)') - ax.set_title(f'Performance Comparison (Normalized to Baseline) - {group_name} Group') - ax.set_xticks([]) - - for idx, label in enumerate(group_benchmarks): - split_labels = format_benchmark_label(label) - for i, sublabel in enumerate(split_labels): - y_pos = max_height + (top_padding * 0.5) + 2 - (i * top_padding * 0.15) - ax.text(idx, y_pos, sublabel, - ha='center', - style='italic', - color='#666666') - - ax.grid(True, axis='y', alpha=0.2) - ax.legend(bbox_to_anchor=(1, 1), loc='upper left') - -def split_large_groups(benchmark_groups): - miscellaneous = [] - new_groups = defaultdict(list) - - split_happened = False - for group, labels in benchmark_groups.items(): - if len(labels) == 1: - miscellaneous.extend(labels) - elif len(labels) > 5: - split_happened = True - mid = len(labels) // 2 - new_groups[group] = labels[:mid] - new_groups[group + '_'] = labels[mid:] - else: - new_groups[group] = labels - - if miscellaneous: - new_groups['Miscellaneous'] = miscellaneous - - if split_happened: - return split_large_groups(new_groups) - else: - return new_groups - -def group_benchmark_labels(benchmark_labels): - benchmark_groups = defaultdict(list) - for label in benchmark_labels: - group = re.match(r'^[^_\s]+', label)[0] - benchmark_groups[group].append(label) - return split_large_groups(benchmark_groups) - -def create_normalized_bar_chart(benchmarks: list[BenchmarkSeries], baseline_name: str) -> list[str]: - latest_results = get_latest_results(benchmarks) - - run_names = sorted(list(set( - name for result in latest_results.values() - for name in result.run_values.keys() - ))) - - if baseline_name not in run_names: - return [] - - benchmark_labels = [b.label for b in benchmarks] - - benchmark_groups = group_benchmark_labels(benchmark_labels) - - html_charts = [] - - for group_name, group_benchmarks in benchmark_groups.items(): - plt.close('all') - non_baseline_runs = [n for n in run_names if n != baseline_name] - - if len(non_baseline_runs) == 0: - continue - - normalized_data = prepare_normalized_data( - latest_results, benchmarks, group_benchmarks, - non_baseline_runs, baseline_name - ) - - fig, ax = plt.subplots(figsize=(10, 6)) - max_height = create_bar_plot( - ax, 
normalized_data, group_benchmarks, non_baseline_runs, - latest_results, benchmarks, baseline_name - ) - add_chart_elements(ax, group_benchmarks, group_name, max_height) - - plt.tight_layout() - html_charts.append(mpld3.fig_to_html(fig)) - plt.close(fig) - - return html_charts +class BenchmarkTimeSeries: + label: str + html: str -def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> str: +def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkTimeSeries]: plt.close('all') num_benchmarks = len(benchmarks) if num_benchmarks == 0: return - fig, axes = plt.subplots(num_benchmarks, 1, figsize=(10, max(4 * num_benchmarks, 30))) - - if num_benchmarks == 1: - axes = [axes] + html_charts = [] - for idx, benchmark in enumerate(benchmarks): - ax = axes[idx] + for _, benchmark in enumerate(benchmarks): + fig, ax = plt.subplots(figsize=(10, 4)) for run in benchmark.runs: sorted_points = sorted(run.results, key=lambda x: x.date) @@ -277,13 +77,12 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str ax.grid(True, alpha=0.2) ax.legend(bbox_to_anchor=(1, 1), loc='upper left') ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) - ax.xaxis.set_major_locator(mdates.AutoDateLocator()) - plt.tight_layout() - html = mpld3.fig_to_html(fig) + plt.tight_layout() + html_charts.append(BenchmarkTimeSeries(html= mpld3.fig_to_html(fig), label= benchmark.label)) + plt.close(fig) - plt.close(fig) - return html + return html_charts def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[BenchmarkSeries]: benchmark_metadata: dict[str, BenchmarkMetadata] = {} @@ -319,12 +118,10 @@ def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: li return benchmark_series def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_names: list[str]) -> str: - baseline_name = compare_names[0] benchmarks = process_benchmark_data(benchmark_runs, compare_names) - comparison_html_charts = create_normalized_bar_chart(benchmarks, baseline_name) - timeseries_html = create_time_series_chart(benchmarks, github_repo) - comparison_charts_html = '\n'.join(f'
<div class="chart">{chart}</div>
' for chart in comparison_html_charts) + timeseries = create_time_series_chart(benchmarks, github_repo) + timeseries_charts_html = '\n'.join(f'
<div class="chart" data-label="{ts.label}">{ts.html}</div>
' for ts in timeseries) html_template = f""" @@ -375,18 +172,44 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ margin-bottom: 16px; }} }} + .filter-container {{ + text-align: center; + margin-bottom: 24px; + }} + .filter-container input {{ + padding: 8px; + font-size: 16px; + border: 1px solid #ccc; + border-radius: 4px; + width: 400px; + max-width: 100%; + }} +

<div class="container"> <h1>Benchmark Results</h1> - <h2>Latest Results Comparison</h2> - <div class="charts"> - {comparison_charts_html} - </div> + <div class="filter-container"> + <input type="text" id="bench-filter" placeholder="Filter benchmarks..."> + </div> <h2>Historical Results</h2> <div class="charts"> - {timeseries_html} + {timeseries_charts_html} </div> </div>
From 65684c69417792e03aa118f43abf2ff01a056b59 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 5 Dec 2024 15:27:16 +0100 Subject: [PATCH 081/148] [benchmarks] add L0 submit kernel benchmark --- scripts/benchmarks/benches/compute.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 169ec0cc64..aaeae5a952 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -50,6 +50,8 @@ def benchmarks(self) -> list[Benchmark]: return [] benches = [ + SubmitKernelL0(self, 0), + SubmitKernelL0(self, 1), SubmitKernelSYCL(self, 0), SubmitKernelSYCL(self, 1), QueueInOrderMemcpy(self, 0, 'Device', 'Device', 1024), @@ -184,6 +186,26 @@ def bin_args(self) -> list[str]: "--KernelExecTime=1" ] +class SubmitKernelL0(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_l0 SubmitKernel {order}" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1" + ] + class ExecImmediateCopyQueue(ComputeBenchmark): def __init__(self, bench, ioq, isCopyOnly, source, destination, size): self.ioq = ioq From 01499a5d179de02e992b1bfc49c2dee6ace6baa4 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 5 Dec 2024 18:00:54 +0000 Subject: [PATCH 082/148] [Benchmarks] Fix unit parsing for compute-benchmarks A few benchmarks were showing 'unknown' as the unit --- scripts/benchmarks/benches/compute.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index aaeae5a952..6038d96ed2 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -86,7 +86,7 @@ def parse_unit_type(compute_unit): return "instr" elif "[us]" in compute_unit: return "μs" - return "unknown" + return compute_unit.replace("[", "").replace("]", "") class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): @@ -279,6 +279,10 @@ def __init__(self, bench, type, size, placement): def name(self): return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + # measurement is in GB/s + def lower_is_better(self): + return False + def bin_args(self) -> list[str]: return [ "--iterations=10000", From 187ac04a5b43248bc841c1841f01d07510c2711d Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 6 Dec 2024 00:02:49 +0000 Subject: [PATCH 083/148] [Tests] replace zeCallsMap with zelTracer --- source/adapters/level_zero/common.cpp | 5 --- test/adapters/level_zero/CMakeLists.txt | 16 ++------ .../adapters/level_zero/event_cache_tests.cpp | 38 ++++++++++++++----- .../multi_device_event_cache_tests.cpp | 24 +++++++++--- test/adapters/level_zero/zeCallMap.cpp | 13 ------- test/adapters/level_zero/ze_tracer_common.hpp | 31 +++++++++++++++ 6 files changed, 83 insertions(+), 44 deletions(-) delete mode 100644 test/adapters/level_zero/zeCallMap.cpp create mode 100644 test/adapters/level_zero/ze_tracer_common.hpp diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index da7f624013..f2ea325202 100644 --- a/source/adapters/level_zero/common.cpp +++
b/source/adapters/level_zero/common.cpp @@ -86,12 +86,7 @@ bool setEnvVar(const char *name, const char *value) { ZeUSMImportExtension ZeUSMImport; -// This will count the calls to Level-Zero -// TODO: remove the ifdef once -// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented -#ifndef UR_L0_CALL_COUNT_IN_TESTS std::map *ZeCallCount = nullptr; -#endif inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index 8fe062b38b..d74d08311b 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -43,24 +43,16 @@ if(UR_BUILD_ADAPTER_L0) endif() if(NOT WIN32 AND NOT UR_STATIC_ADAPTER_L0) - # Make L0 use CallMap from a seprate shared lib so that we can access the map - # from the tests. This only seems to work on linux - add_library(zeCallMap SHARED zeCallMap.cpp) - install_ur_library(zeCallMap) - target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) - # TODO: stop exporting internals like this for tests... - target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) - add_adapter_test(level_zero_ze_calls FIXTURE DEVICES SOURCES event_cache_tests.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "UR_L0_LEAKS_DEBUG=1" + "ZE_ENABLE_TRACING_LAYER=1" ) - target_link_libraries(test-adapter-level_zero_ze_calls PRIVATE zeCallMap) + target_link_libraries(test-adapter-level_zero_ze_calls PRIVATE LevelZeroLoader LevelZeroLoader-Headers) add_adapter_test(level_zero_multi_queue FIXTURE DEVICES @@ -68,10 +60,10 @@ if(UR_BUILD_ADAPTER_L0) multi_device_event_cache_tests.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "UR_L0_LEAKS_DEBUG=1" + "ZE_ENABLE_TRACING_LAYER=1" ) - target_link_libraries(test-adapter-level_zero_multi_queue PRIVATE zeCallMap) + target_link_libraries(test-adapter-level_zero_multi_queue PRIVATE LevelZeroLoader LevelZeroLoader-Headers) endif() add_adapter_test(level_zero_ipc diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp index 5ad970bad1..e62e70ff9c 100644 --- a/test/adapters/level_zero/event_cache_tests.cpp +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -10,12 +10,32 @@ #include #include +#include "ze_tracer_common.hpp" + +std::size_t eventCreateCount = 0; +std::size_t eventDestroyCount = 0; + +void OnEnterEventCreate(ze_event_create_params_t *, ze_result_t, void *, + void **) { + eventCreateCount++; +} + +void OnEnterEventDestroy(ze_event_destroy_params_t *, ze_result_t, void *, + void **) { + eventDestroyCount++; +} + +static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { + zel_core_callbacks_t prologue_callbacks{}; + prologue_callbacks.Event.pfnCreateCb = OnEnterEventCreate; + prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy; + return enableTracing(prologue_callbacks, {}); +}(); + template auto combineFlags(std::tuple tuple) { return std::apply([](auto... args) { return (... 
|= args); }, tuple); } -extern std::map *ZeCallCount; - using FlagsTupleType = std::tuple; @@ -43,8 +63,8 @@ struct urEventCacheTest : uur::urContextTestWithParam { ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, nullptr, &buffer)); - (*ZeCallCount)["zeEventCreate"] = 0; - (*ZeCallCount)["zeEventDestroy"] = 0; + eventCreateCount = 0; + eventDestroyCount = 0; } void TearDown() override { @@ -96,9 +116,9 @@ TEST_P(urEventCacheTest, eventsReuseNoVisibleEvent) { // TODO: why events are not reused for UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE? if ((flags & UR_QUEUE_FLAG_DISCARD_EVENTS) && !(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { - ASSERT_EQ((*ZeCallCount)["zeEventCreate"], 2); + ASSERT_EQ(eventCreateCount, 2); } else { - ASSERT_GE((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); + ASSERT_GE(eventCreateCount, numIters * numEnqueues); } } @@ -115,7 +135,7 @@ TEST_P(urEventCacheTest, eventsReuseWithVisibleEvent) { verifyData(); } - ASSERT_LT((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); + ASSERT_LT(eventCreateCount, numIters * numEnqueues); } TEST_P(urEventCacheTest, eventsReuseWithVisibleEventAndWait) { @@ -139,9 +159,9 @@ TEST_P(urEventCacheTest, eventsReuseWithVisibleEventAndWait) { UUR_ASSERT_SUCCESS_OR_EXIT_IF_UNSUPPORTED(urQueueFinish(queue)); } - ASSERT_GE((*ZeCallCount)["zeEventCreate"], waitEveryN); + ASSERT_GE(eventCreateCount, waitEveryN); // TODO: why there are more events than this? - // ASSERT_LE((*ZeCallCount)["zeEventCreate"], waitEveryN * 2 + 2); + // ASSERT_LE(eventCreateCount, waitEveryN * 2 + 2); } template diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index c30100557c..fd0320547d 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -10,7 +10,21 @@ #include #include -extern std::map *ZeCallCount; +#include "ze_tracer_common.hpp" + +size_t zeCommandListAppendWaitOnEventsCount = 0; + +void OnAppendWaitOnEventsCb(ze_command_list_append_wait_on_events_params_t *, + ze_result_t, void *, void **) { + zeCommandListAppendWaitOnEventsCount++; +} + +static std::shared_ptr<_zel_tracer_handle_t> tracer = [] { + zel_core_callbacks_t prologue_callbacks{}; + prologue_callbacks.CommandList.pfnAppendWaitOnEventsCb = + OnAppendWaitOnEventsCb; + return enableTracing(prologue_callbacks, {}); +}(); using urMultiQueueMultiDeviceEventCacheTest = uur::urAllDevicesTest; TEST_F(urMultiQueueMultiDeviceEventCacheTest, @@ -54,7 +68,7 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, uur::raii::Event event = nullptr; uur::raii::Event eventWait = nullptr; uur::raii::Event eventWaitDummy = nullptr; - (*ZeCallCount)["zeCommandListAppendWaitOnEvents"] = 0; + zeCommandListAppendWaitOnEventsCount = 0; EXPECT_SUCCESS( urEventCreateWithNativeHandle(0, context2, nullptr, eventWait.ptr())); EXPECT_SUCCESS(urEventCreateWithNativeHandle(0, context1, nullptr, @@ -63,7 +77,7 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, urEnqueueEventsWait(queue1, 1, eventWaitDummy.ptr(), eventWait.ptr())); EXPECT_SUCCESS( urEnqueueEventsWait(queue2, 1, eventWait.ptr(), event.ptr())); - EXPECT_EQ((*ZeCallCount)["zeCommandListAppendWaitOnEvents"], 2); + EXPECT_EQ(zeCommandListAppendWaitOnEventsCount, 2); ASSERT_SUCCESS(urEventRelease(eventWaitDummy.get())); ASSERT_SUCCESS(urEventRelease(eventWait.get())); ASSERT_SUCCESS(urEventRelease(event.get())); @@ -89,7 +103,7 @@ 
TEST_F(urMultiQueueMultiDeviceEventCacheTest, uur::raii::Event event = nullptr; uur::raii::Event eventWait = nullptr; uur::raii::Event eventWaitDummy = nullptr; - (*ZeCallCount)["zeCommandListAppendWaitOnEvents"] = 0; + zeCommandListAppendWaitOnEventsCount = 0; EXPECT_SUCCESS( urEventCreateWithNativeHandle(0, context2, nullptr, eventWait.ptr())); EXPECT_SUCCESS(urEventCreateWithNativeHandle(0, context1, nullptr, @@ -98,7 +112,7 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, urEnqueueEventsWait(queue1, 1, eventWaitDummy.ptr(), eventWait.ptr())); EXPECT_SUCCESS( urEnqueueEventsWait(queue2, 1, eventWait.ptr(), event.ptr())); - EXPECT_EQ((*ZeCallCount)["zeCommandListAppendWaitOnEvents"], 3); + EXPECT_EQ(zeCommandListAppendWaitOnEventsCount, 3); ASSERT_SUCCESS(urEventRelease(eventWaitDummy.get())); ASSERT_SUCCESS(urEventRelease(eventWait.get())); ASSERT_SUCCESS(urEventRelease(event.get())); diff --git a/test/adapters/level_zero/zeCallMap.cpp b/test/adapters/level_zero/zeCallMap.cpp deleted file mode 100644 index c2e47b856d..0000000000 --- a/test/adapters/level_zero/zeCallMap.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -// See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include -#include - -// Map used by L0 adapter to count the number of calls to each L0 function -// Lifetime is managed by the adapter, this variable is defined here -// only so that we can read it from the tests. -__attribute__((visibility("default"))) std::map *ZeCallCount = - nullptr; diff --git a/test/adapters/level_zero/ze_tracer_common.hpp b/test/adapters/level_zero/ze_tracer_common.hpp new file mode 100644 index 0000000000..ed33eb30a5 --- /dev/null +++ b/test/adapters/level_zero/ze_tracer_common.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "uur/fixtures.h" + +#include +#include + +#include + +std::shared_ptr<_zel_tracer_handle_t> +enableTracing(zel_core_callbacks_t prologueCallbacks, + zel_core_callbacks_t epilogueCallbacks) { + EXPECT_EQ(zeInit(ZE_INIT_FLAG_GPU_ONLY), ZE_RESULT_SUCCESS); + + zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, + nullptr, nullptr}; + zel_tracer_handle_t tracer = nullptr; + EXPECT_EQ(zelTracerCreate(&tracer_desc, &tracer), ZE_RESULT_SUCCESS); + + EXPECT_EQ(zelTracerSetPrologues(tracer, &prologueCallbacks), + ZE_RESULT_SUCCESS); + EXPECT_EQ(zelTracerSetEpilogues(tracer, &epilogueCallbacks), + ZE_RESULT_SUCCESS); + EXPECT_EQ(zelTracerSetEnabled(tracer, true), ZE_RESULT_SUCCESS); + + return std::shared_ptr<_zel_tracer_handle_t>( + tracer, [](zel_tracer_handle_t tracer) { zelTracerDestroy(tracer); }); +} From 1e4e23a513a5cf774bdbcc3022fc63d33655dc5a Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Fri, 6 Dec 2024 14:49:21 +0100 Subject: [PATCH 084/148] [benchmarks] add support for stddev This makes use of the recently added stddev calculations to display errorbars on the html output timeline chart. 
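One detail worth calling out: compute-benchmarks reports stddev as a percentage of the mean, so the parser converts it back to an absolute value in the result's unit before storing it alongside the median. A small illustration of that conversion (the numbers are made up):

    mean = 125.0          # column 1 of the CSV row, e.g. in μs
    stddev_pct = "2.4%"   # column 3, reported relative to the mean
    stddev = mean * (float(stddev_pct.strip('%')) / 100.0)
    print(stddev)         # 3.0, i.e. an absolute stddev in μs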
--- scripts/benchmarks/benches/compute.py | 9 ++++++--- scripts/benchmarks/benches/result.py | 6 ++++-- scripts/benchmarks/main.py | 6 ++++-- scripts/benchmarks/output_html.py | 23 +++++++++++++++++++---- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 6038d96ed2..f4ab70bf4e 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -118,9 +118,9 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) parsed_results = self.parse_output(result) ret = [] - for label, mean, unit in parsed_results: + for label, median, stddev, unit in parsed_results: extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" - ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) + ret.append(Result(label=self.name() + extra_label, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) return ret def parse_output(self, output): @@ -135,8 +135,11 @@ def parse_output(self, output): try: label = data_row[0] mean = float(data_row[1]) + median = float(data_row[2]) + # compute benchmarks report stddev as % + stddev = mean * (float(data_row[3].strip('%')) / 100.0) unit = data_row[7] - results.append((label, mean, unit)) + results.append((label, median, stddev, unit)) except (ValueError, IndexError) as e: raise ValueError(f"Error parsing output: {e}") if len(results) == 0: diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 336039c342..1442b6dea6 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -18,12 +18,14 @@ class Result: stdout: str passed: bool = True unit: str = "" - # values should not be set by the benchmark + # stddev can be optionally set by the benchmark, + # if not set, it will be calculated automatically. 
+ stddev: float = 0.0 + # values below should not be set by the benchmark name: str = "" lower_is_better: bool = True git_hash: str = '' date: Optional[datetime] = None - stddev: float = 0.0 @dataclass_json @dataclass diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 6a2f8a8273..614fc69197 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -103,7 +103,10 @@ def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result rlist.sort(key=lambda res: res.value) median_index = len(rlist) // 2 median_result = rlist[median_index] - median_result.stddev = stddev + + # only override the stddev if not already set + if median_result.stddev == 0.0: + median_result.stddev = stddev processed.append(median_result) @@ -160,7 +163,6 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): if valid: break results += processed - except Exception as e: if options.exit_on_failure: raise e diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 6233ff900a..832b9da56f 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -32,24 +32,32 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str num_benchmarks = len(benchmarks) if num_benchmarks == 0: - return + return [] html_charts = [] for _, benchmark in enumerate(benchmarks): fig, ax = plt.subplots(figsize=(10, 4)) + all_values = [] + all_stddevs = [] + for run in benchmark.runs: sorted_points = sorted(run.results, key=lambda x: x.date) dates = [point.date for point in sorted_points] values = [point.value for point in sorted_points] + stddevs = [point.stddev for point in sorted_points] + + all_values.extend(values) + all_stddevs.extend(stddevs) - ax.plot_date(dates, values, '-', label=run.name, alpha=0.5) + ax.errorbar(dates, values, yerr=stddevs, fmt='-', label=run.name, alpha=0.5) scatter = ax.scatter(dates, values, picker=True) tooltip_labels = [ f"Date: {point.date.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Value: {point.value:.2f}\n" + f"Value: {point.value:.2f} {benchmark.metadata.unit}\n" + f"Stddev: {point.stddev:.2f} {benchmark.metadata.unit}\n" f"Git Hash: {point.git_hash}" for point in sorted_points ] @@ -62,6 +70,13 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str targets=targets) mpld3.plugins.connect(fig, tooltip) + # This is so that the stddev doesn't fill the entire y axis on the chart + if all_values and all_stddevs: + max_value = max(all_values) + min_value = min(all_values) + max_stddev = max(all_stddevs) + ax.set_ylim(min_value - 3 * max_stddev, max_value + 3 * max_stddev) + ax.set_title(benchmark.label, pad=20) performance_indicator = "lower is better" if benchmark.metadata.lower_is_better else "higher is better" ax.text(0.5, 1.05, f"({performance_indicator})", @@ -79,7 +94,7 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) plt.tight_layout() - html_charts.append(BenchmarkTimeSeries(html= mpld3.fig_to_html(fig), label= benchmark.label)) + html_charts.append(BenchmarkTimeSeries(html=mpld3.fig_to_html(fig), label=benchmark.label)) plt.close(fig) return html_charts From 73ba29bfe9dfbd8ac985051f7a81a349ed4e3e08 Mon Sep 17 00:00:00 2001 From: "chedy.najjar" Date: Fri, 27 Sep 2024 11:57:40 +0100 Subject: [PATCH 085/148] [CUDA][Bindless] Fix memory leak in interop mapping * added a map to ur_device_handle_t_. 
* Capture the leaking CUmipmappedArray in urBindlessImagesMapExternalArrayExp into the map. * Update urBindlessImagesImageFreeExp to check if the CUarray is derived, then destroy the corresponding CUmipmappedArray. --- source/adapters/cuda/device.hpp | 3 +++ source/adapters/cuda/image.cpp | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index 3654f2bb36..d9f6310103 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -114,6 +114,9 @@ struct ur_device_handle_t_ { bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; }; uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; }; + + // bookkeeping for mipmappedArray leaks in Mapping external Memory + std::map<CUarray, CUmipmappedArray> ChildCuarrayFromMipmapMap; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 4840553cc1..c11a85b293 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -430,6 +430,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( ScopedContext Active(hDevice); try { UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem)); + if (auto it = hDevice->ChildCuarrayFromMipmapMap.find((CUarray)hImageMem); + it != hDevice->ChildCuarrayFromMipmapMap.end()) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)it->second)); + hDevice->ChildCuarrayFromMipmapMap.erase(it); + } } catch (ur_result_t Err) { return Err; } catch (...) { @@ -1104,6 +1109,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( CUarray memArray; UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0)); + hDevice->ChildCuarrayFromMipmapMap.emplace(memArray, memMipMap); + *phImageMem = (ur_exp_image_mem_native_handle_t)memArray; } From a99c36963b7f037770c90d108c1857af5b2e91ee Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 6 Dec 2024 17:43:48 +0000 Subject: [PATCH 086/148] [Tests] fix segfault in multi_device_event_cache_tests Using an event created on a context associated with device1 on a queue associated with a different device was causing a segfault in urEnqueueEventsWait --- .../level_zero/multi_device_event_cache_tests.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index fd0320547d..6ad5b0a9ef 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -54,12 +54,12 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, ASSERT_SUCCESS(urDevicePartition(devices[0], &properties, numSubDevices, sub_devices.data(), nullptr)); uur::raii::Context context1 = nullptr; - ASSERT_SUCCESS( - urContextCreate(1, &sub_devices[0], nullptr, context1.ptr())); + ASSERT_SUCCESS(urContextCreate(sub_devices.size(), &sub_devices[0], nullptr, + context1.ptr())); ASSERT_NE(nullptr, context1); uur::raii::Context context2 = nullptr; - ASSERT_SUCCESS( - urContextCreate(1, &sub_devices[1], nullptr, context2.ptr())); + ASSERT_SUCCESS(urContextCreate(sub_devices.size(), &sub_devices[0], nullptr, + context2.ptr())); ASSERT_NE(nullptr, context2); ur_queue_handle_t queue1 = nullptr; ASSERT_SUCCESS(urQueueCreate(context1, sub_devices[0], 0, &queue1)); @@ -91,10 +91,12 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, GTEST_SKIP(); } uur::raii::Context context1 =
nullptr; - ASSERT_SUCCESS(urContextCreate(1, &devices[0], nullptr, context1.ptr())); + ASSERT_SUCCESS( + urContextCreate(devices.size(), &devices[0], nullptr, context1.ptr())); ASSERT_NE(nullptr, context1); uur::raii::Context context2 = nullptr; - ASSERT_SUCCESS(urContextCreate(1, &devices[1], nullptr, context2.ptr())); + ASSERT_SUCCESS( + urContextCreate(devices.size(), &devices[0], nullptr, context2.ptr())); ASSERT_NE(nullptr, context2); ur_queue_handle_t queue1 = nullptr; ASSERT_SUCCESS(urQueueCreate(context1, devices[0], 0, &queue1)); From 03c81a7c53f5af7fd0c85970f938f70b1d6eab75 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 6 Dec 2024 17:46:26 +0000 Subject: [PATCH 087/148] [Tests] remove unnecessary call to urEventCreateWithNativeHandle This had no effect on the test as UR always allocates a new object for the signal event. --- test/adapters/level_zero/multi_device_event_cache_tests.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index 6ad5b0a9ef..7848bb1003 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -69,8 +69,6 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, uur::raii::Event event = nullptr; uur::raii::Event eventWait = nullptr; uur::raii::Event eventWaitDummy = nullptr; zeCommandListAppendWaitOnEventsCount = 0; - EXPECT_SUCCESS( - urEventCreateWithNativeHandle(0, context2, nullptr, eventWait.ptr())); EXPECT_SUCCESS(urEventCreateWithNativeHandle(0, context1, nullptr, eventWaitDummy.ptr())); EXPECT_SUCCESS( @@ -106,8 +104,6 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, uur::raii::Event event = nullptr; uur::raii::Event eventWait = nullptr; uur::raii::Event eventWaitDummy = nullptr; zeCommandListAppendWaitOnEventsCount = 0; - EXPECT_SUCCESS( - urEventCreateWithNativeHandle(0, context2, nullptr, eventWait.ptr())); EXPECT_SUCCESS(urEventCreateWithNativeHandle(0, context1, nullptr, From dbf1b7073174777e2dbe01f40739f72f70e39302 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Thu, 5 Dec 2024 21:45:55 +0000 Subject: [PATCH 088/148] [L0] Fixed getImmCmdList returning cmdlist with wrong properties When using counter-based events, retrieving a command list via getImmCmdList would sometimes return one that is not in-order. Signed-off-by: Zhang, Winston --- source/adapters/level_zero/queue.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 40bf4b78bf..1caeee50e2 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -2398,7 +2398,11 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { uint32_t QueueIndex, QueueOrdinal; auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex); - if (ImmCmdLists[Index] != Queue->CommandListMap.end()) + if ((ImmCmdLists[Index] != Queue->CommandListMap.end()) && + (!Queue->CounterBasedEventsEnabled || + (Queue->CounterBasedEventsEnabled && + (ImmCmdLists[Index]->second.ZeQueueDesc.flags & + ZE_COMMAND_QUEUE_FLAG_IN_ORDER)))) return ImmCmdLists[Index]; ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc; From ede9b14e1eb313c5a6b707c334a1a500f718b719 Mon Sep 17 00:00:00 2001 From: Tim Creech Date: Fri, 6 Dec 2024 10:19:21 -0500 Subject: [PATCH 089/148] [CMake] Add UR_USE_EXTERNAL_UMF to find pre-built UMF For example, by configuring with `-Dumf_ROOT=/path/to/umf-install`.
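A typical configure invocation for this mode would look like the following sketch (the install path is illustrative):

    cmake -S . -B build \
      -DUR_USE_EXTERNAL_UMF=ON \
      -Dumf_ROOT=/path/to/umf-install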
--- source/common/CMakeLists.txt | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index a4b1a8a8c3..c81dedbe8a 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -49,14 +49,22 @@ if (UR_STATIC_ADAPTER_L0) endif() endif() -set(UMF_BUILD_TESTS OFF CACHE INTERNAL "Build UMF tests") -set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples") -set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library") -set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool") -set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider") - -FetchContent_MakeAvailable(unified-memory-framework) -FetchContent_GetProperties(unified-memory-framework) +set(UR_USE_EXTERNAL_UMF "" CACHE BOOL "Use a pre-built UMF") + +if (UR_USE_EXTERNAL_UMF) + find_package(umf REQUIRED) + # Add an alias matching the FetchContent case + add_library(umf::headers ALIAS umf::umf_headers) +else() + set(UMF_BUILD_TESTS OFF CACHE INTERNAL "Build UMF tests") + set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples") + set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library") + set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool") + set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider") + + FetchContent_MakeAvailable(unified-memory-framework) + FetchContent_GetProperties(unified-memory-framework) +endif() if(UR_ENABLE_LATENCY_HISTOGRAM) set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "") From 318978158a8c1f09fff38eba8b9ee4ca12a29af1 Mon Sep 17 00:00:00 2001 From: Tim Creech Date: Sat, 7 Dec 2024 09:41:26 -0500 Subject: [PATCH 090/148] fixup: initialize cache variable with a bool --- source/common/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index c81dedbe8a..5278649e9d 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -49,7 +49,7 @@ if (UR_STATIC_ADAPTER_L0) endif() endif() -set(UR_USE_EXTERNAL_UMF "" CACHE BOOL "Use a pre-built UMF") +set(UR_USE_EXTERNAL_UMF OFF CACHE BOOL "Use a pre-built UMF") if (UR_USE_EXTERNAL_UMF) find_package(umf REQUIRED) From f74b2d7e4fb19aee4c4794cdb2ebc1d87dce02b0 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Sat, 7 Dec 2024 16:46:06 -0800 Subject: [PATCH 091/148] add a few missing Intel GPU device queries also, fix the device ID query --- source/adapters/opencl/device.cpp | 78 ++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index b33d637a84..f6f6ee36c8 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -323,6 +323,14 @@ static cl_int mapURDeviceInfoToCL(ur_device_info_t URPropName) { return CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL; case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: return CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL; + case UR_DEVICE_INFO_GPU_EU_SLICES: + return CL_DEVICE_NUM_SLICES_INTEL; + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + return CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL; + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + return CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL; + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return CL_DEVICE_NUM_THREADS_PER_EU_INTEL; case UR_DEVICE_INFO_IP_VERSION: return 
CL_DEVICE_IP_VERSION_INTEL; default: @@ -369,18 +377,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_DEVICE_ID: { bool Supported = false; UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( - cl_adapter::cast<cl_device_id>(hDevice), {"cl_khr_pci_bus_info"}, + cl_adapter::cast<cl_device_id>(hDevice), {"cl_intel_device_attribute_query"}, Supported)); if (!Supported) { return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } - cl_device_pci_bus_info_khr PciInfo = {}; + cl_uint DeviceId = {}; CL_RETURN_ON_FAILURE(clGetDeviceInfo( - cl_adapter::cast<cl_device_id>(hDevice), CL_DEVICE_PCI_BUS_INFO_KHR, - sizeof(PciInfo), &PciInfo, nullptr)); - return ReturnValue(PciInfo.pci_device); + cl_adapter::cast<cl_device_id>(hDevice), CL_DEVICE_ID_INTEL, + sizeof(DeviceId), &DeviceId, nullptr)); + return ReturnValue(DeviceId); } case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { @@ -993,6 +1001,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + case UR_DEVICE_INFO_PCI_ADDRESS: { + bool Supported = false; + UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( + cl_adapter::cast<cl_device_id>(hDevice), {"cl_khr_pci_bus_info"}, + Supported)); + + if (!Supported) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + cl_device_pci_bus_info_khr PciInfo = {}; + CL_RETURN_ON_FAILURE(clGetDeviceInfo( + cl_adapter::cast<cl_device_id>(hDevice), CL_DEVICE_PCI_BUS_INFO_KHR, + sizeof(PciInfo), &PciInfo, nullptr)); + + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x", + PciInfo.pci_domain, + PciInfo.pci_bus, + PciInfo.pci_device, + PciInfo.pci_function); + return ReturnValue(AddressBuffer); + } + case UR_DEVICE_INFO_GPU_EU_COUNT: { + /* The EU count can be queried using CL_DEVICE_MAX_COMPUTE_UNITS for Intel + * GPUs. */ + + bool Supported; + UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( + cl_adapter::cast<cl_device_id>(hDevice), + {"cl_intel_device_attribute_query"}, Supported)); + if (!Supported) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + cl_device_type CLType; + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(cl_adapter::cast<cl_device_id>(hDevice), CL_DEVICE_TYPE, + sizeof(cl_device_type), &CLType, nullptr)); + if (!(CLType & CL_DEVICE_TYPE_GPU)) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + CL_RETURN_ON_FAILURE(clGetDeviceInfo( + cl_adapter::cast<cl_device_id>(hDevice), CL_DEVICE_MAX_COMPUTE_UNITS, + propSize, pPropValue, pPropSizeRet)); + + return UR_RESULT_SUCCESS; + } + case UR_DEVICE_INFO_GPU_EU_SLICES: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: case UR_DEVICE_INFO_IP_VERSION: { bool Supported; UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( @@ -1080,13 +1142,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, * the Registry.
*/ case UR_DEVICE_INFO_COMPONENT_DEVICES: case UR_DEVICE_INFO_COMPOSITE_DEVICE: - case UR_DEVICE_INFO_PCI_ADDRESS: - case UR_DEVICE_INFO_GPU_EU_COUNT: case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - case UR_DEVICE_INFO_GPU_EU_SLICES: - case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: /* These enums have no equivalent in OpenCL */ case UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: From 7e888fd0a842477eab96abd5a95a08095bac9af8 Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Mon, 9 Dec 2024 16:50:48 +0800 Subject: [PATCH 092/148] update missing change --- source/loader/CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index 1837108645..aaca3b1569 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -165,10 +165,12 @@ if(UR_ENABLE_SANITIZER) ) if(UR_ENABLE_SYMBOLIZER) - target_sources(ur_loader - PRIVATE + set(symbolizer_sources ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/linux/symbolizer.cpp ) + target_sources(ur_loader + PRIVATE ${symbolizer_sources} + ) target_include_directories(ur_loader PRIVATE ${LLVM_INCLUDE_DIRS}) target_link_libraries(ur_loader PRIVATE LLVMSupport LLVMSymbolize) # In in-tree build, if LLVM is built with libc++, we also need to build @@ -183,7 +185,7 @@ if(UR_ENABLE_SANITIZER) OUTPUT_VARIABLE LIBCXX_ABI_PATH OUTPUT_STRIP_TRAILING_WHITESPACE) set_property(SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/symbolizer.cpp + ${symbolizer_sources} APPEND_STRING PROPERTY COMPILE_FLAGS " -stdlib=libc++ ") if(NOT EXISTS ${LIBCXX_PATH} OR NOT EXISTS ${LIBCXX_ABI_PATH}) From 10f499a3eca27480dfd247fdeeb7ad857fa1e2f8 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Mon, 9 Dec 2024 13:05:45 +0100 Subject: [PATCH 093/148] [benchmarks] add explicit benchmark groups This patch adds back the previously removed bar charts, with one critical difference - we no longer normalize all results to a baseline 100%, but instead, benchmarks need to be explicitly grouped for the comparison bar charts to be generated. The groups need to be chosen such that the units are all the same, and the values are roughly similar.
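To make the grouping contract concrete before the diff below, here is a minimal sketch of how a benchmark opts into a comparison group. It is illustrative only (not part of the patch) and assumes the Benchmark and Result types from scripts/benchmarks/benches; the class name, group name, and value are placeholders.

# Minimal sketch, assuming the repo's Benchmark/Result types.
class FooLatencyBench(Benchmark):
    def name(self):
        return "foo latency benchmark"

    def explicit_group(self):
        # Benchmarks returning the same non-empty group name are drawn in
        # the same comparison bar chart, so every member of "Bar Group"
        # must report the same unit and roughly comparable magnitudes.
        return "Bar Group"

    def run(self, env_vars) -> list[Result]:
        value = 42.0  # placeholder measurement
        return [Result(label=self.name(),
                       explicit_group=self.explicit_group(),
                       value=value, command="", env=env_vars,
                       stdout="", unit="ms")]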
--- scripts/benchmarks/benches/compute.py | 15 ++- scripts/benchmarks/benches/result.py | 1 + scripts/benchmarks/benches/test.py | 19 +-- scripts/benchmarks/main.py | 6 +- scripts/benchmarks/output_html.py | 182 ++++++++++++++++++++++++-- 5 files changed, 203 insertions(+), 20 deletions(-) diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index f4ab70bf4e..229a50e84d 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -104,6 +104,9 @@ def extra_env_vars(self) -> dict: def setup(self): self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) + def explicit_group(self): + return "" + def run(self, env_vars) -> list[Result]: command = [ f"{self.benchmark_bin}", @@ -120,7 +123,8 @@ def run(self, env_vars) -> list[Result]: ret = [] for label, median, stddev, unit in parsed_results: extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" - ret.append(Result(label=self.name() + extra_label, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) + explicit_group = self.explicit_group() + extra_label if self.explicit_group() != "" else "" + ret.append(Result(label=self.name() + extra_label, explicit_group=explicit_group, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) return ret def parse_output(self, output): @@ -158,6 +162,9 @@ def name(self): order = "in order" if self.ioq else "out of order" return f"api_overhead_benchmark_sycl SubmitKernel {order}" + def explicit_group(self): + return "SubmitKernel" + def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", @@ -178,6 +185,9 @@ def name(self): order = "in order" if self.ioq else "out of order" return f"api_overhead_benchmark_ur SubmitKernel {order}" + def explicit_group(self): + return "SubmitKernel" + def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", @@ -198,6 +208,9 @@ def name(self): order = "in order" if self.ioq else "out of order" return f"api_overhead_benchmark_l0 SubmitKernel {order}" + def explicit_group(self): + return "SubmitKernel" + def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 1442b6dea6..c975fa792d 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -18,6 +18,7 @@ class Result: stdout: str passed: bool = True unit: str = "" + explicit_group: str = "" # stddev can be optionally set by the benchmark, # if not set, it will be calculated automatically. 
stddev: float = 0.0 diff --git a/scripts/benchmarks/benches/test.py b/scripts/benchmarks/benches/test.py index 802688f032..efe789f678 100644 --- a/scripts/benchmarks/benches/test.py +++ b/scripts/benchmarks/benches/test.py @@ -20,30 +20,31 @@ def setup(self): def benchmarks(self) -> list[Benchmark]: bench_configs = [ - ("Memory Bandwidth", 2000, 200), - ("Latency", 100, 20), - ("Throughput", 1500, 150), - ("FLOPS", 3000, 300), - ("Cache Miss Rate", 250, 25), + ("Memory Bandwidth", 2000, 200, "Foo Group"), + ("Latency", 100, 20, "Bar Group"), + ("Throughput", 1500, 150, "Foo Group"), + ("FLOPS", 3000, 300, "Foo Group"), + ("Cache Miss Rate", 250, 25, "Bar Group"), ] result = [] - for base_name, base_value, base_diff in bench_configs: + for base_name, base_value, base_diff, group in bench_configs: for variant in range(6): value_multiplier = 1.0 + (variant * 0.2) name = f"{base_name} {variant+1}" value = base_value * value_multiplier diff = base_diff * value_multiplier - result.append(TestBench(name, value, diff)) + result.append(TestBench(name, value, diff, group)) return result class TestBench(Benchmark): - def __init__(self, name, value, diff): + def __init__(self, name, value, diff, group = ''): self.bname = name self.value = value self.diff = diff + self.group = group super().__init__("") def name(self): @@ -58,7 +59,7 @@ def setup(self): def run(self, env_vars) -> list[Result]: random_value = self.value + random.uniform(-1 * (self.diff), self.diff) return [ - Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms") + Result(label=self.name(), explicit_group=self.group, value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms") ] def teardown(self): diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 614fc69197..ab4adafee6 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -183,6 +183,9 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # should this be configurable? history.load(1000) + # remove duplicates. this can happen if e.g., --compare baseline is specified manually. + compare_names = list(dict.fromkeys(compare_names)) + for name in compare_names: compare_result = history.get_compare(name) if compare_result: @@ -203,7 +206,8 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # Otherwise we might be comparing the results to themselves. 
if not options.dry_run: history.save(saved_name, results, save_name is not None) - compare_names.append(saved_name) + if saved_name not in compare_names: + compare_names.append(saved_name) if options.output_html: html_content = generate_html(history.runs, 'oneapi-src/unified-runtime', compare_names) diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 832b9da56f..80524977e1 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -10,6 +10,7 @@ from dataclasses import dataclass import matplotlib.dates as mdates from benches.result import BenchmarkRun, Result +import numpy as np @dataclass class BenchmarkMetadata: @@ -23,11 +24,14 @@ class BenchmarkSeries: runs: list[BenchmarkRun] @dataclass -class BenchmarkTimeSeries: +class BenchmarkChart: label: str html: str -def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkTimeSeries]: +def tooltip_css() -> str: + return '.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}' + +def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkChart]: plt.close('all') num_benchmarks = len(benchmarks) @@ -66,7 +70,7 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str for point in sorted_points] tooltip = mpld3.plugins.PointHTMLTooltip(scatter, tooltip_labels, - css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}', + css=tooltip_css(), targets=targets) mpld3.plugins.connect(fig, tooltip) @@ -94,7 +98,104 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) plt.tight_layout() - html_charts.append(BenchmarkTimeSeries(html=mpld3.fig_to_html(fig), label=benchmark.label)) + html_charts.append(BenchmarkChart(html=mpld3.fig_to_html(fig), label=benchmark.label)) + plt.close(fig) + + return html_charts + +@dataclass +class ExplicitGroup: + name: str + nnames: int + metadata: BenchmarkMetadata + runs: dict[str, dict[str, Result]] + +def create_explicit_groups(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[ExplicitGroup]: + groups = {} + + for run in benchmark_runs: + if run.name in compare_names: + for res in run.results: + if res.explicit_group != '': + if res.explicit_group not in groups: + groups[res.explicit_group] = ExplicitGroup(name=res.explicit_group, nnames=len(compare_names), + metadata=BenchmarkMetadata(unit=res.unit, lower_is_better=res.lower_is_better), + runs={}) + + group = groups[res.explicit_group] + if res.label not in group.runs: + group.runs[res.label] = {name: None for name in compare_names} + + if group.runs[res.label][run.name] is None: + group.runs[res.label][run.name] = res + + return list(groups.values()) + +def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChart]: + plt.close('all') + + html_charts = [] + + for group in groups: + fig, ax = plt.subplots(figsize=(10, 6)) + + x = np.arange(group.nnames) + x_labels = [] + width = 0.8 / len(group.runs) + + max_height = 0 + + for i, (run_name, run_results) in enumerate(group.runs.items()): + offset = width * i + + positions = x + offset + x_labels = run_results.keys() + valid_data = [r.value if r is not None else 0 for r in run_results.values()] + rects = ax.bar(positions, valid_data, width, label=run_name) + # 
This is a hack to disable all bar_label. Setting labels to empty doesn't work. + # We create our own labels below for each bar, this works better in mpld3. + ax.bar_label(rects, fmt='') + + for rect, run, res in zip(rects, run_results.keys(), run_results.values()): + height = rect.get_height() + if height > max_height: + max_height = height + + ax.text(rect.get_x() + rect.get_width()/2., height + 2, + f'{res.value:.1f}', + ha='center', va='bottom', fontsize=9) + + tooltip_labels = [ + f"Run: {run}\n" + f"Label: {res.label}\n" + f"Value: {res.value:.2f} {res.unit}\n" + ] + tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css=tooltip_css()) + mpld3.plugins.connect(ax.figure, tooltip) + + ax.set_xticks([]) + ax.grid(True, axis='y', alpha=0.2) + ax.set_ylabel(f"Value ({group.metadata.unit})") + ax.legend(loc='upper left') + ax.set_title(group.name, pad=20) + performance_indicator = "lower is better" if group.metadata.lower_is_better else "higher is better" + ax.text(0.5, 1.03, f"({performance_indicator})", + ha='center', + transform=ax.transAxes, + style='italic', + fontsize=7, + color='#666666') + + for idx, label in enumerate(x_labels): + # this is a hack to get labels to show above the legend + # we normalize the idx to transAxes transform and offset it a little. + x_norm = (idx + 0.3 - ax.get_xlim()[0]) / (ax.get_xlim()[1] - ax.get_xlim()[0]) + ax.text(x_norm, 1.00, label, + transform=ax.transAxes, + color='#666666') + + plt.tight_layout() + html_charts.append(BenchmarkChart(label=group.name, html=mpld3.fig_to_html(fig))) plt.close(fig) return html_charts @@ -138,6 +239,11 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ timeseries = create_time_series_chart(benchmarks, github_repo) timeseries_charts_html = '\n'.join(f'
<div class="chart">{ts.html}</div>' for ts in timeseries) + explicit_groups = create_explicit_groups(benchmark_runs, compare_names) + bar_charts = create_grouped_bar_charts(explicit_groups) + bar_charts_html = '\n'.join(f'<div class="chart">{bc.html}</div>
' for bc in bar_charts) + html_template = f""" @@ -199,21 +305,72 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ width: 400px; max-width: 100%; }} + details {{ + margin-bottom: 24px; + }} + summary {{ + font-size: 18px; + font-weight: 500; + cursor: pointer; + padding: 12px; + background: #e9ecef; + border-radius: 8px; + user-select: none; + }} + summary:hover {{ + background: #dee2e6; + }} @@ -222,13 +379,20 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_
-        <div class="timeseries">
-            <h2>Historical Results</h2>
-            {timeseries_charts_html}
-        </div>
+        <details class="section" open>
+            <summary>Historical Results</summary>
+            <div class="charts">
+                {timeseries_charts_html}
+            </div>
+        </details>
+        <details class="section">
+            <summary>Comparisons</summary>
+            <div class="charts">
+                {bar_charts_html}
+            </div>
+        </details>
""" - return html_template From 44b4ff98ae7955186387a4e18ecbb8dfe4920d65 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Mon, 9 Dec 2024 07:56:24 -0800 Subject: [PATCH 094/148] fix formatting --- source/adapters/opencl/device.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index f6f6ee36c8..f92068436b 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -377,18 +377,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_DEVICE_ID: { bool Supported = false; UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( - cl_adapter::cast(hDevice), {"cl_intel_device_attribute_query"}, - Supported)); + cl_adapter::cast(hDevice), + {"cl_intel_device_attribute_query"}, Supported)); if (!Supported) { return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } - cl_uint DeviceId = {}; CL_RETURN_ON_FAILURE(clGetDeviceInfo( - cl_adapter::cast(hDevice), CL_DEVICE_ID_INTEL, - sizeof(DeviceId), &DeviceId, nullptr)); - return ReturnValue(DeviceId); + cl_adapter::cast(hDevice), CL_DEVICE_ID_INTEL, propSize, + pPropValue, pPropSizeRet)); + + return UR_RESULT_SUCCESS; } case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { @@ -1019,9 +1019,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x", - PciInfo.pci_domain, - PciInfo.pci_bus, - PciInfo.pci_device, + PciInfo.pci_domain, PciInfo.pci_bus, PciInfo.pci_device, PciInfo.pci_function); return ReturnValue(AddressBuffer); } From bc5d6a695dc844688563b1d95289ae712e3d5ebb Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 6 Dec 2024 21:59:44 +0000 Subject: [PATCH 095/148] [UMF] bump UMF version to v0.10.0 and adjust code to the new params API --- source/adapters/cuda/usm.cpp | 33 ++++++----- source/adapters/hip/usm.cpp | 33 ++++++----- source/adapters/level_zero/context.cpp | 42 ++++++++------ source/adapters/level_zero/usm.cpp | 42 ++++++++------ source/adapters/level_zero/v2/usm.cpp | 55 ++++++++++++++----- source/common/CMakeLists.txt | 9 ++- .../umf_pools/disjoint_pool_config_parser.cpp | 6 +- .../umf_pools/disjoint_pool_config_parser.hpp | 44 ++++++++++++++- 8 files changed, 184 insertions(+), 80 deletions(-) diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 8a6ac41b08..863d90cd79 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -403,29 +403,32 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, nullptr) .second; + auto UmfHostParamsHandle = getUmfParamsHandle( + DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); HostMemPool = - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfHostParamsHandle.get()) .second; for (const auto &Device : Context->getDevices()) { MemProvider = umf::memoryProviderMakeUnique(Context, Device) .second; - DeviceMemPool = umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Device]) - .second; + auto UmfDeviceParamsHandle = getUmfParamsHandle( + 
DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); + DeviceMemPool = + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfDeviceParamsHandle.get()) + .second; MemProvider = umf::memoryProviderMakeUnique(Context, Device) .second; - SharedMemPool = umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Shared]) - .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( + DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); + SharedMemPool = + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfSharedParamsHandle.get()) + .second; Context->addPool(this); } } @@ -452,6 +455,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( new ur_usm_pool_handle_t_(Context, PoolDesc)); } catch (const UsmAllocationException &Ex) { return Ex.getError(); + } catch (umf_result_t e) { + return umf::umf2urResult(e); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; } return UR_RESULT_SUCCESS; #else diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 5e28f3592d..90f152daf0 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -345,30 +345,33 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, nullptr) .second; + auto UmfHostParamsHandle = getUmfParamsHandle( + DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); HostMemPool = - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfHostParamsHandle.get()) .second; for (const auto &Device : Context->getDevices()) { MemProvider = umf::memoryProviderMakeUnique(Context, Device) .second; - DeviceMemPool = umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Device]) - .second; + auto UmfDeviceParamsHandle = getUmfParamsHandle( + DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); + DeviceMemPool = + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfDeviceParamsHandle.get()) + .second; MemProvider = umf::memoryProviderMakeUnique(Context, Device) .second; - SharedMemPool = umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Shared]) - .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( + DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); + SharedMemPool = + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfSharedParamsHandle.get()) + .second; Context->addPool(this); } } @@ -395,6 +398,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( new ur_usm_pool_handle_t_(Context, PoolDesc)); } catch (const UsmAllocationException &Ex) { return Ex.getError(); + } catch (umf_result_t e) { + return umf::umf2urResult(e); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; } return UR_RESULT_SUCCESS; #else diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index fd13dc35df..6603cdd6f8 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -48,6 +48,8 @@ ur_result_t urContextCreate( } } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (umf_result_t e) { + return umf::umf2urResult(e); } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -196,36 +198,41 @@ ur_result_t ur_context_handle_t_::initialize() { auto MemProvider = umf::memoryProviderMakeUnique( reinterpret_cast(this), Device) .second; + auto UmfDeviceParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]); DeviceMemPools.emplace( std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple(umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &DisjointPoolConfigInstance - .Configs[usm::DisjointPoolMemType::Device]) + std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), + std::move(MemProvider), + UmfDeviceParamsHandle.get()) .second)); MemProvider = umf::memoryProviderMakeUnique( reinterpret_cast(this), Device) .second; + + auto UmfSharedParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]); SharedMemPools.emplace( std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple(umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &DisjointPoolConfigInstance - .Configs[usm::DisjointPoolMemType::Shared]) + std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), + std::move(MemProvider), + UmfSharedParamsHandle.get()) .second)); MemProvider = umf::memoryProviderMakeUnique( reinterpret_cast(this), Device) .second; + + auto UmfSharedROParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance + .Configs[usm::DisjointPoolMemType::SharedReadOnly]); SharedReadOnlyMemPools.emplace( std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple( - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &DisjointPoolConfigInstance - .Configs[usm::DisjointPoolMemType::SharedReadOnly]) - .second)); + std::make_tuple(umf::poolMakeUniqueFromOps( + umfDisjointPoolOps(), std::move(MemProvider), + UmfSharedROParamsHandle.get()) + .second)); MemProvider = umf::memoryProviderMakeUnique( reinterpret_cast(this), Device) @@ -273,10 +280,11 @@ ur_result_t ur_context_handle_t_::initialize() { auto MemProvider = umf::memoryProviderMakeUnique( reinterpret_cast(this), nullptr) .second; + auto UmfHostParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]); HostMemPool = - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]) + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfHostParamsHandle.get()) .second; MemProvider = umf::memoryProviderMakeUnique( diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 28bdf233e8..55cff6b209 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -709,6 +709,10 @@ ur_result_t urUSMPoolCreate( } catch (const UsmAllocationException &Ex) { return Ex.getError(); + } catch (umf_result_t e) { + return umf2urResult(e); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; } return UR_RESULT_SUCCESS; } @@ -1051,46 +1055,50 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, nullptr) .second; + auto UmfHostParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]); HostMemPool = - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), + UmfHostParamsHandle.get()) .second; for (auto device : Context->Devices) { MemProvider = umf::memoryProviderMakeUnique(Context, device) .second; + auto UmfDeviceParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]); DeviceMemPools.emplace( std::piecewise_construct, std::make_tuple(device), - std::make_tuple(umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Device]) + std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), + std::move(MemProvider), + UmfDeviceParamsHandle.get()) .second)); MemProvider = umf::memoryProviderMakeUnique(Context, device) .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]); SharedMemPools.emplace( std::piecewise_construct, std::make_tuple(device), - std::make_tuple(umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::Shared]) + std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), + std::move(MemProvider), + UmfSharedParamsHandle.get()) .second)); MemProvider = umf::memoryProviderMakeUnique( Context, device) .second; + auto UmfSharedROParamsHandle = getUmfParamsHandle( + DisjointPoolConfigInstance + .Configs[usm::DisjointPoolMemType::SharedReadOnly]); SharedReadOnlyMemPools.emplace( std::piecewise_construct, std::make_tuple(device), - std::make_tuple( - umf::poolMakeUniqueFromOps( - umfDisjointPoolOps(), std::move(MemProvider), - &this->DisjointPoolConfigs - .Configs[usm::DisjointPoolMemType::SharedReadOnly]) - .second)); + std::make_tuple(umf::poolMakeUniqueFromOps( + umfDisjointPoolOps(), std::move(MemProvider), + UmfSharedROParamsHandle.get()) + .second)); } } diff --git a/source/adapters/level_zero/v2/usm.cpp b/source/adapters/level_zero/v2/usm.cpp index f31a2b5202..f7396e282f 100644 --- a/source/adapters/level_zero/v2/usm.cpp +++ b/source/adapters/level_zero/v2/usm.cpp @@ -82,31 +82,55 @@ descToDisjoinPoolMemType(const usm::pool_descriptor &desc) { } static umf::pool_unique_handle_t -makePool(umf_disjoint_pool_params_t *poolParams, +makePool(usm::umf_disjoint_pool_config_t *poolParams, usm::pool_descriptor poolDescriptor) { - level_zero_memory_provider_params_t params = {}; - params.level_zero_context_handle = poolDescriptor.hContext->getZeHandle(); - params.level_zero_device_handle = + umf_level_zero_memory_provider_params_handle_t params = NULL; + umf_result_t umf_ret = umfLevelZeroMemoryProviderParamsCreate(¶ms); + if (umf_ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(umf_ret); + } + + umf_ret = umfLevelZeroMemoryProviderParamsSetContext( + params, poolDescriptor.hContext->getZeHandle()); + if (umf_ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(umf_ret); + }; + + ze_device_handle_t level_zero_device_handle = poolDescriptor.hDevice ? 
poolDescriptor.hDevice->ZeDevice : nullptr; - params.memory_type = urToUmfMemoryType(poolDescriptor.type); + + umf_ret = umfLevelZeroMemoryProviderParamsSetDevice(params, + level_zero_device_handle); + if (umf_ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(umf_ret); + } + + umf_ret = umfLevelZeroMemoryProviderParamsSetMemoryType( + params, urToUmfMemoryType(poolDescriptor.type)); + if (umf_ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(umf_ret); + } std::vector residentZeHandles; if (poolDescriptor.type == UR_USM_TYPE_DEVICE) { - assert(params.level_zero_device_handle); + assert(level_zero_device_handle); auto residentHandles = poolDescriptor.hContext->getP2PDevices(poolDescriptor.hDevice); - residentZeHandles.push_back(params.level_zero_device_handle); + residentZeHandles.push_back(level_zero_device_handle); for (auto &device : residentHandles) { residentZeHandles.push_back(device->ZeDevice); } - params.resident_device_handles = residentZeHandles.data(); - params.resident_device_count = residentZeHandles.size(); + umf_ret = umfLevelZeroMemoryProviderParamsSetResidentDevices( + params, residentZeHandles.data(), residentZeHandles.size()); + if (umf_ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(umf_ret); + } } auto [ret, provider] = - umf::providerMakeUniqueFromOps(umfLevelZeroMemoryProviderOps(), ¶ms); + umf::providerMakeUniqueFromOps(umfLevelZeroMemoryProviderOps(), params); if (ret != UMF_RESULT_SUCCESS) { throw umf::umf2urResult(ret); } @@ -118,9 +142,11 @@ makePool(umf_disjoint_pool_params_t *poolParams, throw umf::umf2urResult(ret); return std::move(poolHandle); } else { + auto umfParams = getUmfParamsHandle(*poolParams); + auto [ret, poolHandle] = umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(provider), - static_cast(poolParams)); + static_cast(umfParams.get())); if (ret != UMF_RESULT_SUCCESS) throw umf::umf2urResult(ret); return std::move(poolHandle); @@ -199,10 +225,13 @@ ur_result_t urUSMPoolCreate( pPoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with ///< ::ur_usm_pool_limits_desc_t ur_usm_pool_handle_t *hPool ///< [out] pointer to USM memory pool -) { - + ) try { *hPool = new ur_usm_pool_handle_t_(hContext, pPoolDesc); return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); } ur_result_t diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index e24d987567..2a181638ba 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -32,11 +32,10 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - # special branch with cherry-picks for incoming pulldown - # contains UMF PRs: #866, #924, and #930 - # branch was based on commit: 3bae087c9a8c0cbed5bde40f0d5a2 - # umf-fixes-nov-pulldown: 25.11.2024: Disable libudev in hwloc builds - set(UMF_TAG a7b6152b7b095c88ddf34bc7d442eb4c2b3f74d6) + # tag v0.10.0 + # Tagger: Łukasz Stolarczuk + # Date: Mon Dec 9 17:01:43 2024 +0100 + set(UMF_TAG v0.10.0) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/umf_pools/disjoint_pool_config_parser.cpp b/source/common/umf_pools/disjoint_pool_config_parser.cpp index 0e82072ae2..5a5329ff17 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.cpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.cpp @@ -25,9 +25,13 @@ constexpr auto operator""_GB(unsigned long long x) -> size_t { return x * 1024 * 1024 * 1024; } +umf_disjoint_pool_config_t::umf_disjoint_pool_config_t() + : SlabMinSize(0), MaxPoolableSize(0), Capacity(0), + MinBucketSize(UMF_DISJOINT_POOL_MIN_BUCKET_DEFAULT_SIZE), PoolTrace(0), + SharedLimits(nullptr), Name("disjoint_pool") {} + DisjointPoolAllConfigs::DisjointPoolAllConfigs(int trace) { for (auto &Config : Configs) { - Config = umfDisjointPoolParamsDefault(); Config.PoolTrace = trace; } diff --git a/source/common/umf_pools/disjoint_pool_config_parser.hpp b/source/common/umf_pools/disjoint_pool_config_parser.hpp index 2efb7ddfdc..e3ee1f7b54 100644 --- a/source/common/umf_pools/disjoint_pool_config_parser.hpp +++ b/source/common/umf_pools/disjoint_pool_config_parser.hpp @@ -18,12 +18,28 @@ namespace usm { enum DisjointPoolMemType { Host, Device, Shared, SharedReadOnly, All }; +typedef struct umf_disjoint_pool_config_t { + umf_disjoint_pool_config_t(); + + size_t SlabMinSize; + size_t MaxPoolableSize; + size_t Capacity; + size_t MinBucketSize; + int PoolTrace; + umf_disjoint_pool_shared_limits_t *SharedLimits; + const char *Name; +} umf_disjoint_pool_config_t; + +using umfDisjointPoolParamsHandle = + std::unique_ptr; + // Stores configuration for all instances of USM allocator class DisjointPoolAllConfigs { public: size_t EnableBuffers = 1; std::shared_ptr limits; - umf_disjoint_pool_params_t Configs[DisjointPoolMemType::All]; + umf_disjoint_pool_config_t Configs[DisjointPoolMemType::All]; DisjointPoolAllConfigs(int trace = 0); }; @@ -54,6 +70,32 @@ class DisjointPoolAllConfigs { // "1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M" DisjointPoolAllConfigs parseDisjointPoolConfig(const std::string &config, int trace = 1); + +static inline void UMF_CALL_THROWS(umf_result_t res) { + if (res != UMF_RESULT_SUCCESS) { + throw res; + } +} + +static inline umfDisjointPoolParamsHandle +getUmfParamsHandle(umf_disjoint_pool_config_t &config) { + umf_disjoint_pool_params_handle_t cHandle; + UMF_CALL_THROWS(umfDisjointPoolParamsCreate(&cHandle)); + + umfDisjointPoolParamsHandle handle(cHandle, &umfDisjointPoolParamsDestroy); + UMF_CALL_THROWS( + umfDisjointPoolParamsSetSlabMinSize(cHandle, config.SlabMinSize)); + UMF_CALL_THROWS(umfDisjointPoolParamsSetMaxPoolableSize( + cHandle, config.MaxPoolableSize)); + UMF_CALL_THROWS(umfDisjointPoolParamsSetCapacity(cHandle, config.Capacity)); + UMF_CALL_THROWS( + 
umfDisjointPoolParamsSetMinBucketSize(cHandle, config.MinBucketSize)); + UMF_CALL_THROWS( + umfDisjointPoolParamsSetSharedLimits(cHandle, config.SharedLimits)); + UMF_CALL_THROWS(umfDisjointPoolParamsSetName(cHandle, config.Name)); + UMF_CALL_THROWS(umfDisjointPoolParamsSetTrace(cHandle, config.PoolTrace)); + return handle; +} } // namespace usm #endif From b209eba86e4adf5deefa89603eec80d63ee05b8a Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Wed, 27 Nov 2024 00:48:27 -0800 Subject: [PATCH 096/148] [DevASAN] Do allocation with USM pool to reduce memory overhead Release mapped physical memory according to its dependency may cause some problems. So, we decide to use USM pool to do allocation to reduce memory overhead. --- .../sanitizer/asan/asan_interceptor.cpp | 54 +++++++------- .../sanitizer/asan/asan_interceptor.hpp | 7 +- .../layers/sanitizer/asan/asan_shadow.cpp | 72 +++++++------------ .../layers/sanitizer/asan/asan_shadow.hpp | 11 +-- 4 files changed, 64 insertions(+), 80 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 8d2a9e5ee2..1a1185e1ba 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -36,7 +36,8 @@ AsanInterceptor::~AsanInterceptor() { // We must release these objects before releasing adapters, since // they may use the adapter in their destructor for (const auto &[_, DeviceInfo] : m_DeviceMap) { - DeviceInfo->Shadow->Destory(); + [[maybe_unused]] auto URes = DeviceInfo->Shadow->Destory(); + assert(URes == UR_RESULT_SUCCESS); } m_Quarantine = nullptr; @@ -96,6 +97,10 @@ ur_result_t AsanInterceptor::allocateMemory(ur_context_handle_t Context, void *Allocated = nullptr; + if (Pool == nullptr) { + Pool = ContextInfo->getUSMPool(); + } + if (Type == AllocType::DEVICE_USM) { UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( Context, Device, Properties, Pool, NeededSize, &Allocated)); @@ -228,16 +233,6 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, ContextInfo->Stats.UpdateUSMRealFreed( ToFreeAllocInfo->AllocSize, ToFreeAllocInfo->getRedzoneSize()); - if (ToFreeAllocInfo->Type == AllocType::HOST_USM) { - for (auto &Device : ContextInfo->DeviceList) { - UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow( - ToFreeAllocInfo)); - } - } else { - UR_CALL(getDeviceInfo(ToFreeAllocInfo->Device) - ->Shadow->ReleaseShadow(ToFreeAllocInfo)); - } - UR_CALL(getContext()->urDdiTable.USM.pfnFree( Context, (void *)(ToFreeAllocInfo->AllocBegin))); @@ -436,12 +431,6 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { auto ProgramInfo = getProgramInfo(Program); assert(ProgramInfo != nullptr && "unregistered program!"); - for (auto AI : ProgramInfo->AllocInfoForGlobals) { - UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI)); - m_AllocationMap.erase(AI->AllocBegin); - } - ProgramInfo->AllocInfoForGlobals.clear(); - ProgramInfo->InstrumentedKernels.clear(); return UR_RESULT_SUCCESS; @@ -560,10 +549,6 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { {}}); ContextInfo->insertAllocInfo({Device}, AI); - ProgramInfo->AllocInfoForGlobals.emplace(AI); - - std::scoped_lock Guard(m_AllocationMapMutex); - m_AllocationMap.emplace(AI->AllocBegin, std::move(AI)); } } @@ -887,9 +872,14 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const { ContextInfo::~ContextInfo() { Stats.Print(Handle); - [[maybe_unused]] auto 
Result = - getContext()->urDdiTable.Context.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); + [[maybe_unused]] ur_result_t URes; + if (USMPool) { + URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool); + assert(URes == UR_RESULT_SUCCESS); + } + + URes = getContext()->urDdiTable.Context.pfnRelease(Handle); + assert(URes == UR_RESULT_SUCCESS); // check memory leaks if (getAsanInterceptor()->getOptions().DetectLeaks && @@ -905,6 +895,22 @@ ContextInfo::~ContextInfo() { } } +ur_usm_pool_handle_t ContextInfo::getUSMPool() { + std::call_once(PoolInit, [this]() { + ur_usm_pool_desc_t Desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, 0}; + auto URes = + getContext()->urDdiTable.USM.pfnPoolCreate(Handle, &Desc, &USMPool); + if (URes != UR_RESULT_SUCCESS && + URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + getContext()->logger.warning( + "Failed to create USM pool, the memory overhead " + "may increase: {}", + URes); + } + }); + return USMPool; +} + AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; if (Host.LocalArgs) { diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 122865bd11..f1e80dae56 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -112,7 +112,6 @@ struct ProgramInfo { std::atomic<int32_t> RefCount = 1; // Program is built only once, so we don't need to lock it - std::unordered_set<std::shared_ptr<AllocInfo>> AllocInfoForGlobals; std::unordered_set<ur_kernel_handle_t> InstrumentedKernels; explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) { @@ -132,6 +131,10 @@ struct ContextInfo { ur_context_handle_t Handle; + + ur_usm_pool_handle_t USMPool{}; + std::once_flag PoolInit; + std::atomic<int32_t> RefCount = 1; std::vector<ur_device_handle_t> DeviceList; @@ -155,6 +158,8 @@ struct ContextInfo { AllocInfos.List.emplace_back(AI); } } + + ur_usm_pool_handle_t getUSMPool(); }; struct AsanRuntimeDataWrapper { diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.cpp b/source/loader/layers/sanitizer/asan/asan_shadow.cpp index 8e2329ac06..de0679687b 100644 --- a/source/loader/layers/sanitizer/asan/asan_shadow.cpp +++ b/source/loader/layers/sanitizer/asan/asan_shadow.cpp @@ -108,11 +108,15 @@ ur_result_t ShadowMemoryGPU::Setup() { // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( Context, nullptr, ShadowSize, (void **)&ShadowBegin); - if (Result == UR_RESULT_SUCCESS) { - ShadowEnd = ShadowBegin + ShadowSize; - // Retain the context which reserves shadow memory - getContext()->urDdiTable.Context.pfnRetain(Context); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Shadow memory reservation failed with size {}: {}", + (void *)ShadowSize, Result); + return Result; } + ShadowEnd = ShadowBegin + ShadowSize; + // Retain the context which reserves shadow memory + getContext()->urDdiTable.Context.pfnRetain(Context); // Set shadow memory for null pointer // For GPU, we use up to 1 page of shadow memory @@ -137,6 +141,24 @@ ur_result_t ShadowMemoryGPU::Destory() { Context, (void *)PrivateShadowOffset)); PrivateShadowOffset = 0; } + + static ur_result_t Result = [this]() { + const size_t PageSize = GetVirtualMemGranularity(Context, Device); + for (auto [MappedPtr, PhysicalMem] : VirtualMemMaps) { + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( + Context, (void *)MappedPtr, PageSize)); + UR_CALL( + getContext()->urDdiTable.PhysicalMem.pfnRelease(PhysicalMem)); + } +
UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree( + Context, (const void *)ShadowBegin, GetShadowSize())); + UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context)); + return UR_RESULT_SUCCESS; + }(); + if (!Result) { + return Result; + } + if (LocalShadowOffset != 0) { UR_CALL(getContext()->urDdiTable.USM.pfnFree( Context, (void *)LocalShadowOffset)); @@ -205,19 +227,8 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, return URes; } - VirtualMemMaps[MappedPtr].first = PhysicalMem; + VirtualMemMaps[MappedPtr] = PhysicalMem; } - - // We don't need to record virtual memory map for null pointer, - // since it doesn't have an alloc info. - if (Ptr == 0) { - continue; - } - - auto AllocInfoIt = - getAsanInterceptor()->findAllocInfoByAddress(Ptr); - assert(AllocInfoIt); - VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); } } @@ -235,35 +246,6 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, return UR_RESULT_SUCCESS; } -ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr<AllocInfo> AI) { - uptr ShadowBegin = MemToShadow(AI->AllocBegin); - uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize); - assert(ShadowBegin <= ShadowEnd); - - static const size_t PageSize = GetVirtualMemGranularity(Context, Device); - - for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); - MappedPtr <= ShadowEnd; MappedPtr += PageSize) { - std::scoped_lock Guard(VirtualMemMapsMutex); - if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { - continue; - } - VirtualMemMaps[MappedPtr].second.erase(AI); - if (VirtualMemMaps[MappedPtr].second.empty()) { - UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( - Context, (void *)MappedPtr, PageSize)); - UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease( - VirtualMemMaps[MappedPtr].first)); - getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}", - (void *)MappedPtr, - (void *)(MappedPtr + PageSize - 1)); - VirtualMemMaps.erase(MappedPtr); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG, uptr &Begin, uptr &End) { diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.hpp b/source/loader/layers/sanitizer/asan/asan_shadow.hpp index 0658a07925..48054378fe 100644 --- a/source/loader/layers/sanitizer/asan/asan_shadow.hpp +++ b/source/loader/layers/sanitizer/asan/asan_shadow.hpp @@ -35,10 +35,6 @@ struct ShadowMemory { virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value) = 0; - virtual ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo>) { - return UR_RESULT_SUCCESS; - } - virtual size_t GetShadowSize() = 0; virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, @@ -98,8 +94,6 @@ struct ShadowMemoryGPU : public ShadowMemory { ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value) override final; - ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo> AI) override final; - ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG, uptr &Begin, uptr &End) override final; ur_mutex VirtualMemMapsMutex; - std::unordered_map<uptr, std::pair<ur_physical_mem_handle_t, - std::unordered_set<std::shared_ptr<AllocInfo>>>> - VirtualMemMaps; + std::unordered_map<uptr, ur_physical_mem_handle_t> VirtualMemMaps; uptr LocalShadowOffset = 0; From dd6b5ae548d002c76e19bd6f8c17e75d5f62b5c5 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 10 Dec 2024 15:26:04 +0100 Subject: [PATCH 097/148] [benchmarks] adjust label and legend positions on bar charts Also add more info to tooltips.
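The legend change in the diff below works around mpld3's inability to place a legend outside the axes by reserving vertical headroom above the bars. A small self-contained matplotlib sketch of the same idea (illustrative values, not part of the patch):

import matplotlib.pyplot as plt

# Reserve in-axes headroom for the legend: mpld3 cannot render a legend
# outside the axes, so grow the y-limit past the tallest bar instead.
runs = {"baseline": 10.0, "candidate": 12.5, "experimental": 11.0}
fig, ax = plt.subplots()
max_height = 0.0
for i, (name, value) in enumerate(runs.items()):
    ax.bar(i, value, width=0.8, label=name)
    max_height = max(max_height, value)
legend_height = len(runs) * 0.1  # roughly 10% of headroom per legend entry
ax.set_ylim(0, max_height * (1 + legend_height))
ax.legend(loc='upper left')
fig.savefig('grouped_bars.png')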
--- scripts/benchmarks/output_html.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 80524977e1..8bcda148b1 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -161,18 +161,27 @@ def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChar if height > max_height: max_height = height - ax.text(rect.get_x() + rect.get_width()/2., height + 2, + ax.text(rect.get_x() + rect.get_width()/2., height + 1, f'{res.value:.1f}', ha='center', va='bottom', fontsize=9) tooltip_labels = [ + f"Date: {res.date.strftime('%Y-%m-%d %H:%M:%S')}\n" f"Run: {run}\n" f"Label: {res.label}\n" f"Value: {res.value:.2f} {res.unit}\n" + f"Stddev: {res.stddev:.2f} {res.unit}\n" ] tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css=tooltip_css()) mpld3.plugins.connect(ax.figure, tooltip) + # normally we'd just set legend to be outside + # the chart, but this is not supported by mpld3. + # instead, we adjust the y axis to account for + # the height of the bars. + legend_height = len(group.runs) * 0.1 + ax.set_ylim(0, max_height * (1 + legend_height)) + ax.set_xticks([]) ax.grid(True, axis='y', alpha=0.2) ax.set_ylabel(f"Value ({group.metadata.unit})") @@ -190,7 +199,7 @@ def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChar # this is a hack to get labels to show above the legend # we normalize the idx to transAxes transform and offset it a little. x_norm = (idx + 0.3 - ax.get_xlim()[0]) / (ax.get_xlim()[1] - ax.get_xlim()[0]) - ax.text(x_norm, 1.00, label, + ax.text(x_norm, 1.03, label, transform=ax.transAxes, color='#666666') From 83439fe938f0f54a09e15e2058cee85231800de7 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 10 Dec 2024 17:27:24 +0000 Subject: [PATCH 098/148] [Tests] fix multi_device_event_cache_tests Expected number of zeCommandListAppendWaitOnEvents calls is now 2: a workaround for driver in-order lists has been removed.
--- test/adapters/level_zero/multi_device_event_cache_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/adapters/level_zero/multi_device_event_cache_tests.cpp b/test/adapters/level_zero/multi_device_event_cache_tests.cpp index 7848bb1003..6b8457fe37 100644 --- a/test/adapters/level_zero/multi_device_event_cache_tests.cpp +++ b/test/adapters/level_zero/multi_device_event_cache_tests.cpp @@ -110,7 +110,7 @@ TEST_F(urMultiQueueMultiDeviceEventCacheTest, urEnqueueEventsWait(queue1, 1, eventWaitDummy.ptr(), eventWait.ptr())); EXPECT_SUCCESS( urEnqueueEventsWait(queue2, 1, eventWait.ptr(), event.ptr())); - EXPECT_EQ(zeCommandListAppendWaitOnEventsCount, 3); + EXPECT_EQ(zeCommandListAppendWaitOnEventsCount, 2); ASSERT_SUCCESS(urEventRelease(eventWaitDummy.get())); ASSERT_SUCCESS(urEventRelease(eventWait.get())); ASSERT_SUCCESS(urEventRelease(event.get())); From 941b1edfbc3472258f7fb823080b2f6dadfd8aff Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 10 Dec 2024 17:30:18 +0000 Subject: [PATCH 099/148] Enable level_zero_multi_queue test on CI --- .github/workflows/multi_device.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/multi_device.yml b/.github/workflows/multi_device.yml index 48a804bdf8..2abc32cb1e 100644 --- a/.github/workflows/multi_device.yml +++ b/.github/workflows/multi_device.yml @@ -58,8 +58,7 @@ jobs: - name: Test adapter specific working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" -E "test-adapter-level_zero_multi_queue" --timeout 180 - # TODO: investigate why test-adapter-level_zero_multi_queue fails on newer driver + run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180 - name: Test adapters working-directory: ${{github.workspace}}/build From b532350f3136676c9516287dc3389fafa9006dc9 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Wed, 4 Dec 2024 11:03:00 +0100 Subject: [PATCH 100/148] [DevAsan] Report error when using unsupported extension API Some extensions are not currently supported by device address sanitizer. We may add support for those later, but for now, we will report errors when using unsupported API to let users know instead of failing for other random errors and puzzling the users. --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 223 ++++++++++++++++++ test/layers/sanitizer/asan.cpp | 63 +++++ 2 files changed, 286 insertions(+) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index a823ceba2d..73415c1bcf 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1554,6 +1554,52 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urDeviceGetInfo +__urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( + ur_device_handle_t hDevice, ///< [in] handle of the device instance + ur_device_info_t propName, ///< [in] type of the info to retrieve + size_t propSize, ///< [in] the number of bytes pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. 
+                    ///< If propSize is not equal to or greater than the real number of bytes
+                    ///< needed to return the info
+                    ///< then the ::UR_RESULT_ERROR_INVALID_SIZE error is returned and
+                    ///< pPropValue is not used.
+    size_t *
+        pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName.
+) {
+    auto pfnGetInfo = getContext()->urDdiTable.Device.pfnGetInfo;
+
+    if (nullptr == pfnGetInfo) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    // For unsupported features for device address sanitizer, we override the result.
+    static std::unordered_set<ur_device_info_t> UnsupportedFeatures = {
+        // Virtual Memory
+        UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT,
+
+        // Command Buffer
+        UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP,
+        UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP,
+    };
+    if (UnsupportedFeatures.find(propName) != UnsupportedFeatures.end()) {
+        UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+        // handle non-bool return type queries
+        if (propName == UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP) {
+            ur_device_command_buffer_update_capability_flags_t flag = 0;
+            return ReturnValue(flag);
+        }
+
+        return ReturnValue(false);
+    }
+
+    return pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Exported function for filling application's Global table
 /// with current process' addresses
@@ -1843,6 +1889,168 @@ __urdlllocal ur_result_t UR_APICALL urGetUSMProcAddrTable(
     return result;
 }

+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Device table
+/// with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+__urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_device_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
+            UR_MAJOR_VERSION(version) ||
+        UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) >
+            UR_MINOR_VERSION(version)) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnGetInfo = ur_sanitizer_layer::asan::urDeviceGetInfo;
+
+    return result;
+}
+
+template <typename MsgType, typename T> struct NotSupportedApi;
+
+template <typename MsgType, typename R, typename... A>
+struct NotSupportedApi<MsgType, R (*)(A...)> {
+    R static ReportError(A...) {
+        getContext()->logger.error(MsgType::value);
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+};
+
+struct DevAsanNotSupportCommandBufferMsg {
+    static constexpr const char *value =
+        "CommandBuffer extension is not supported by UR_LAYER_ASAN";
+};
+
+struct DevAsanNotSupportVirtualMemoryMsg {
+    static constexpr const char *value =
+        "VirtualMemory extension is not supported by UR_LAYER_ASAN";
+};
+
+template <typename T>
+using CommandBufferNotSupported =
+    NotSupportedApi<DevAsanNotSupportCommandBufferMsg, T>;
+
+template <typename T>
+using VirtualMemoryNotSupported =
+    NotSupportedApi<DevAsanNotSupportVirtualMemoryMsg, T>;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's CommandBufferExp table
+/// with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+__urdlllocal ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_command_buffer_exp_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
+            UR_MAJOR_VERSION(version) ||
+        UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) >
+            UR_MINOR_VERSION(version)) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+#define SET_UNSUPPORTED(FuncPtr)                                               \
+    do {                                                                       \
+        FuncPtr = CommandBufferNotSupported<decltype(FuncPtr)>::ReportError;   \
+    } while (0)
+
+    SET_UNSUPPORTED(pDdiTable->pfnCreateExp);
+    SET_UNSUPPORTED(pDdiTable->pfnRetainExp);
+    SET_UNSUPPORTED(pDdiTable->pfnReleaseExp);
+    SET_UNSUPPORTED(pDdiTable->pfnFinalizeExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendKernelLaunchExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendUSMFillExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferCopyExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferWriteExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferReadExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferCopyRectExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferWriteRectExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferReadRectExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendMemBufferFillExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendUSMPrefetchExp);
+    SET_UNSUPPORTED(pDdiTable->pfnAppendUSMAdviseExp);
+    SET_UNSUPPORTED(pDdiTable->pfnEnqueueExp);
+    SET_UNSUPPORTED(pDdiTable->pfnRetainCommandExp);
+    SET_UNSUPPORTED(pDdiTable->pfnReleaseCommandExp);
+    SET_UNSUPPORTED(pDdiTable->pfnUpdateKernelLaunchExp);
+    SET_UNSUPPORTED(pDdiTable->pfnUpdateSignalEventExp);
+    SET_UNSUPPORTED(pDdiTable->pfnUpdateWaitEventsExp);
+    SET_UNSUPPORTED(pDdiTable->pfnGetInfoExp);
+    SET_UNSUPPORTED(pDdiTable->pfnCommandGetInfoExp);
+
+#undef SET_UNSUPPORTED
+
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's VirtualMem table
+/// with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+__urdlllocal ur_result_t UR_APICALL urGetVirtualMemProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_virtual_mem_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
+            UR_MAJOR_VERSION(version) ||
+        UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) >
+            UR_MINOR_VERSION(version)) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+#define SET_UNSUPPORTED(FuncPtr)                                               \
+    do {                                                                       \
+        FuncPtr = VirtualMemoryNotSupported<decltype(FuncPtr)>::ReportError;   \
+    } while (0)
+
+    SET_UNSUPPORTED(pDdiTable->pfnGranularityGetInfo);
+    SET_UNSUPPORTED(pDdiTable->pfnReserve);
+    SET_UNSUPPORTED(pDdiTable->pfnFree);
+    SET_UNSUPPORTED(pDdiTable->pfnMap);
+    SET_UNSUPPORTED(pDdiTable->pfnUnmap);
+    SET_UNSUPPORTED(pDdiTable->pfnSetAccess);
+    SET_UNSUPPORTED(pDdiTable->pfnGetInfo);
+
+#undef SET_UNSUPPORTED
+
+    return result;
+}
 } // namespace asan

 ur_result_t context_t::init(ur_dditable_t *dditable,
@@ -1911,6 +2119,21 @@ ur_result_t context_t::init(ur_dditable_t *dditable,
                                 UR_API_VERSION_CURRENT, &dditable->USM);
     }

+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::asan::urGetDeviceProcAddrTable(
+            UR_API_VERSION_CURRENT, &dditable->Device);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::asan::urGetCommandBufferExpProcAddrTable(
+            UR_API_VERSION_CURRENT, &dditable->CommandBufferExp);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::asan::urGetVirtualMemProcAddrTable(
+            UR_API_VERSION_CURRENT, &dditable->VirtualMem);
+    }
+
     return result;
 }

diff --git a/test/layers/sanitizer/asan.cpp b/test/layers/sanitizer/asan.cpp
index 0fbfe4cefe..02d368a9d0 100644
--- a/test/layers/sanitizer/asan.cpp
+++ b/test/layers/sanitizer/asan.cpp
@@ -56,3 +56,66 @@ TEST(DeviceAsan, Initialization) {
     status = urLoaderConfigRelease(loaderConfig);
     ASSERT_EQ(status, UR_RESULT_SUCCESS);
 }
+
+TEST(DeviceAsan, UnsupportedFeature) {
+    ur_result_t status;
+
+    ur_loader_config_handle_t loaderConfig;
+    status = urLoaderConfigCreate(&loaderConfig);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+    status = urLoaderConfigEnableLayer(loaderConfig, "UR_LAYER_ASAN");
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    status = urLoaderInit(0, loaderConfig);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    ur_adapter_handle_t adapter;
+    status = urAdapterGet(1, &adapter, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    ur_platform_handle_t platform;
+    status = urPlatformGet(&adapter, 1, 1, &platform, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    ur_device_handle_t device;
+    status = urDeviceGet(platform, UR_DEVICE_TYPE_DEFAULT, 1, &device, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    ur_context_handle_t context;
+    status = urContextCreate(1, &device, nullptr, &context);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    // Check for explicitly unsupported features
+    ur_bool_t isSupported;
+    status = urDeviceGetInfo(device, UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT,
+                             sizeof(isSupported), &isSupported, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+    ASSERT_EQ(isSupported, 0);
+
+    status = urDeviceGetInfo(device, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP,
+                             sizeof(isSupported), &isSupported, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+    ASSERT_EQ(isSupported, 0);
+
+    ur_device_command_buffer_update_capability_flags_t update_flag;
+    status = urDeviceGetInfo(
+        device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP,
+        sizeof(update_flag), &update_flag, nullptr);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+    ASSERT_EQ(update_flag, 0);
+
+    status = urContextRelease(context);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    status = urDeviceRelease(device);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    status = urAdapterRelease(adapter);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    status = urLoaderTearDown();
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+
+    status = urLoaderConfigRelease(loaderConfig);
+    ASSERT_EQ(status, UR_RESULT_SUCCESS);
+}
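For illustration only (this sketch is not part of the patch): once the layer
overrides these queries, a portable application can gate its command-buffer
usage on the same urDeviceGetInfo call that the test above exercises. A
minimal sketch, assuming a valid `device` handle obtained as in the test:

    ur_bool_t hasCmdBuffers = false;
    // Under UR_LAYER_ASAN this reports false, so the application should
    // take a fallback path instead of calling the urCommandBuffer* entry
    // points, which now return UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
    urDeviceGetInfo(device, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP,
                    sizeof(hasCmdBuffers), &hasCmdBuffers, nullptr);
    if (!hasCmdBuffers) {
        // fall back to regular urEnqueue* submissions
    }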
From 08acf05f2d661421a14f71c3cd1f21daa1e34908 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Thu, 5 Dec 2024 16:07:01 -0800
Subject: [PATCH 101/148] [UR] Fix usage of In Order sync list given counting
 events

- Fixes a race condition when the wait events are counting events and
  the context's synchronous immediate command list is executed.
- If a counting event is used in a non-immediate command list, an error
  will be returned.
- When both InOrderLists and CounterBasedEvents are enabled, the context
  init command list will now be created in-order.

Signed-off-by: Neil R. Spruit
---
 source/adapters/level_zero/context.cpp |  7 +++++++
 source/adapters/level_zero/device.cpp  | 14 ++++++++++++++
 source/adapters/level_zero/device.hpp  |  3 +++
 source/adapters/level_zero/queue.cpp   |  9 +--------
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index fd13dc35df..6abacecb6b 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -319,9 +319,16 @@ ur_result_t ur_context_handle_t_::initialize() {
   ZeCommandQueueDesc.index = 0;
   ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
+  if (Device->useDriverInOrderLists() &&
+      Device->useDriverCounterBasedEvents()) {
+    logger::debug(
+        "L0 Synchronous Immediate Command List needed with In Order property.");
+    ZeCommandQueueDesc.flags |= ZE_COMMAND_LIST_FLAG_IN_ORDER;
+  }
   ZE2UR_CALL(
       zeCommandListCreateImmediate,
       (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit));
+
   return UR_RESULT_SUCCESS;
 }

diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 34de05d2c1..b7422fe2cc 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1528,6 +1528,20 @@ bool ur_device_handle_t_::useDriverInOrderLists() {
   return UseDriverInOrderLists;
 }

+bool ur_device_handle_t_::useDriverCounterBasedEvents() {
+  // Use counter-based events implementation from L0 driver.
+
+  static const bool DriverCounterBasedEventsEnabled = [] {
+    const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS");
+    if (!UrRet) {
+      return true;
+    }
+    return std::atoi(UrRet) != 0;
+  }();
+
+  return DriverCounterBasedEventsEnabled;
+}
+
 ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
                                             int SubSubDeviceIndex) {
   // Maintain various device properties cache.

diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 512a5ff714..fb4c519c34 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -159,6 +159,9 @@ struct ur_device_handle_t_ : _ur_object {
   // Whether Adapter uses driver's implementation of in-order lists or not
   bool useDriverInOrderLists();

+  // Whether Adapter uses driver's implementation of counter-based events or not
+  bool useDriverCounterBasedEvents();
+
   // Returns whether immediate command lists are used on this device.
ImmCmdlistMode ImmCommandListUsed{}; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 3167b141a7..43b5f16cd4 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1186,16 +1186,9 @@ ur_queue_handle_t_::ur_queue_handle_t_( ZeCommandListBatchComputeConfig.startSize(); CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); - static const bool useDriverCounterBasedEvents = [] { - const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS"); - if (!UrRet) { - return true; - } - return std::atoi(UrRet) != 0; - }(); this->CounterBasedEventsEnabled = UsingImmCmdLists && isInOrderQueue() && Device->useDriverInOrderLists() && - useDriverCounterBasedEvents && + Device->useDriverCounterBasedEvents() && Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound; this->InterruptBasedEventsEnabled = isLowPowerEvents() && isInOrderQueue() && Device->useDriverInOrderLists(); From 064da157b3bb3a32354dc2e95a9bf59ea119d9db Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Wed, 23 Oct 2024 01:27:32 -0700 Subject: [PATCH 102/148] [DeviceMSAN] Support MemorySanitizer for device offloading --- source/loader/CMakeLists.txt | 16 + .../loader/layers/sanitizer/asan/asan_ddi.cpp | 25 +- .../loader/layers/sanitizer/asan/asan_ddi.hpp | 2 + .../sanitizer/asan/asan_interceptor.cpp | 269 ++- .../layers/sanitizer/asan/asan_shadow.hpp | 4 +- .../layers/sanitizer/msan/msan_allocator.cpp | 26 + .../layers/sanitizer/msan/msan_allocator.hpp | 41 + .../layers/sanitizer/msan/msan_buffer.cpp | 204 +++ .../layers/sanitizer/msan/msan_buffer.hpp | 82 + .../loader/layers/sanitizer/msan/msan_ddi.cpp | 1528 +++++++++++++++++ .../loader/layers/sanitizer/msan/msan_ddi.hpp | 22 + .../sanitizer/msan/msan_interceptor.cpp | 490 ++++++ .../sanitizer/msan/msan_interceptor.hpp | 323 ++++ .../layers/sanitizer/msan/msan_libdevice.hpp | 66 + .../layers/sanitizer/msan/msan_options.cpp | 90 + .../layers/sanitizer/msan/msan_options.hpp | 27 + .../layers/sanitizer/msan/msan_report.cpp | 43 + .../layers/sanitizer/msan/msan_report.hpp | 27 + .../layers/sanitizer/msan/msan_shadow.cpp | 291 ++++ .../layers/sanitizer/msan/msan_shadow.hpp | 144 ++ .../linux/sanitizer_utils.cpp | 18 + .../sanitizer_common/sanitizer_common.hpp | 2 + source/loader/layers/sanitizer/ur_sanddi.cpp | 54 + .../layers/sanitizer/ur_sanitizer_layer.cpp | 4 + 24 files changed, 3640 insertions(+), 158 deletions(-) create mode 100644 source/loader/layers/sanitizer/msan/msan_allocator.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_allocator.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_buffer.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_buffer.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_ddi.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_ddi.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_interceptor.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_interceptor.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_libdevice.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_options.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_options.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_report.cpp create mode 100644 source/loader/layers/sanitizer/msan/msan_report.hpp create mode 100644 source/loader/layers/sanitizer/msan/msan_shadow.cpp create mode 100644 
source/loader/layers/sanitizer/msan/msan_shadow.hpp create mode 100644 source/loader/layers/sanitizer/ur_sanddi.cpp diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index aaca3b1569..d8f6056ae9 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -151,6 +151,21 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_statistics.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_validator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_validator.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_allocator.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_buffer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_ddi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_ddi.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_interceptor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_interceptor.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_libdevice.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_options.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_options.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_report.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_report.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/msan/msan_shadow.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/linux/backtrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/sanitizer_allocator.hpp @@ -160,6 +175,7 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/sanitizer_stacktrace.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanddi.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanitizer_layer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/ur_sanitizer_layer.hpp ) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 73415c1bcf..f8ded3ec7a 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -2053,26 +2053,10 @@ __urdlllocal ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( } } // namespace asan -ur_result_t context_t::init(ur_dditable_t *dditable, - const std::set &enabledLayerNames, - [[maybe_unused]] codeloc_data codelocData) { +ur_result_t initAsanDDITable(ur_dditable_t *dditable) { ur_result_t result = UR_RESULT_SUCCESS; - if (enabledLayerNames.count("UR_LAYER_ASAN")) { - enabledType = SanitizerType::AddressSanitizer; - initAsanInterceptor(); - } else if (enabledLayerNames.count("UR_LAYER_MSAN")) { - enabledType = SanitizerType::MemorySanitizer; - } else if (enabledLayerNames.count("UR_LAYER_TSAN")) { - enabledType = SanitizerType::ThreadSanitizer; - } - - // Only support AddressSanitizer now - if (enabledType != SanitizerType::AddressSanitizer) { - return result; - } - - urDdiTable = *dditable; + getContext()->logger.always("==== DeviceSanitizer: ASAN"); if (UR_RESULT_SUCCESS == result) { result = 
ur_sanitizer_layer::asan::urGetGlobalProcAddrTable( @@ -2134,6 +2118,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->VirtualMem); } + if (result != UR_RESULT_SUCCESS) { + getContext()->logger.error("Initialize ASAN DDI table failed: {}", + result); + } + return result; } diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.hpp b/source/loader/layers/sanitizer/asan/asan_ddi.hpp index 735c4409d8..fe67d3d6bf 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.hpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.hpp @@ -17,4 +17,6 @@ namespace ur_sanitizer_layer { void initAsanInterceptor(); void destroyAsanInterceptor(); +ur_result_t initAsanDDITable(ur_dditable_t *dditable); + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 1a1185e1ba..19af8546c2 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -676,162 +676,155 @@ ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { - do { - auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel should be instrumented"); - - // Validate pointer arguments - if (getOptions().DetectKernelArguments) { - for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { - auto Ptr = PtrPair.first; - if (Ptr == nullptr) { - continue; - } - if (auto ValidateResult = ValidateUSMPointer( - ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) { - ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, - ValidateResult, PtrPair.second); - exitWithErrors(); - } + auto KernelInfo = getKernelInfo(Kernel); + assert(KernelInfo && "Kernel should be instrumented"); + + // Validate pointer arguments + if (getOptions().DetectKernelArguments) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + auto Ptr = PtrPair.first; + if (Ptr == nullptr) { + continue; + } + if (auto ValidateResult = ValidateUSMPointer( + ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) { + ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, + ValidateResult, PtrPair.second); + exitWithErrors(); } } + } - // Set membuffer arguments - for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { - char *ArgPointer = nullptr; - UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgIndex, nullptr, ArgPointer); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error( - "Failed to set buffer {} as the {} arg to kernel {}: {}", - ur_cast(MemBuffer.get()), ArgIndex, Kernel, - URes); - } + // Set membuffer arguments + for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { + char *ArgPointer = nullptr; + UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgIndex, nullptr, ArgPointer); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to set buffer {} as the {} arg to kernel {}: {}", + ur_cast(MemBuffer.get()), ArgIndex, Kernel, + URes); } + } - auto ArgNums = GetKernelNumArgs(Kernel); - // We must prepare all kernel args before call - // urKernelGetSuggestedLocalWorkSize, otherwise the call will fail on - // CPU device. 
- if (ArgNums) { - ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("Failed to set launch info: {}", - URes); - return URes; - } + auto ArgNums = GetKernelNumArgs(Kernel); + // We must prepare all kernel args before call + // urKernelGetSuggestedLocalWorkSize, otherwise the call will fail on + // CPU device. + if (ArgNums) { + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to set launch info: {}", URes); + return URes; } + } - if (LaunchInfo.LocalWorkSize.empty()) { - LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = - getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, - LaunchInfo.GlobalWorkOffset, LaunchInfo.GlobalWorkSize, - LaunchInfo.LocalWorkSize.data()); - if (URes != UR_RESULT_SUCCESS) { - if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { - return URes; - } - // If urKernelGetSuggestedLocalWorkSize is not supported by driver, we fallback - // to inefficient implementation - for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { - LaunchInfo.LocalWorkSize[Dim] = 1; - } + if (LaunchInfo.LocalWorkSize.empty()) { + LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( + Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset, + LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + if (URes != UR_RESULT_SUCCESS) { + if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + return URes; + } + // If urKernelGetSuggestedLocalWorkSize is not supported by driver, we fallback + // to inefficient implementation + for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { + LaunchInfo.LocalWorkSize[Dim] = 1; } } + } - const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data(); - uint32_t NumWG = 1; - for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { - NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) / - LocalWorkSize[Dim]; - } + const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data(); + uint32_t NumWG = 1; + for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) { + NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) / + LocalWorkSize[Dim]; + } - // Prepare asan runtime data - LaunchInfo.Data.Host.GlobalShadowOffset = - DeviceInfo->Shadow->ShadowBegin; - LaunchInfo.Data.Host.GlobalShadowOffsetEnd = - DeviceInfo->Shadow->ShadowEnd; - LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; - LaunchInfo.Data.Host.Debug = getOptions().Debug ? 
1 : 0; - - auto LocalMemoryUsage = - GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); - auto PrivateMemoryUsage = - GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); - - getContext()->logger.info( - "KernelInfo {} (LocalMemory={}, PrivateMemory={})", (void *)Kernel, - LocalMemoryUsage, PrivateMemoryUsage); - - // Write shadow memory offset for local memory - if (getOptions().DetectLocals) { - if (DeviceInfo->Shadow->AllocLocalShadow( - Queue, NumWG, LaunchInfo.Data.Host.LocalShadowOffset, - LaunchInfo.Data.Host.LocalShadowOffsetEnd) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for local " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking local memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - getContext()->logger.info( - "ShadowMemory(Local, WorkGroup{}, {} - {})", NumWG, - (void *)LaunchInfo.Data.Host.LocalShadowOffset, - (void *)LaunchInfo.Data.Host.LocalShadowOffsetEnd); - } + // Prepare asan runtime data + LaunchInfo.Data.Host.GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; + LaunchInfo.Data.Host.GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; + LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; + LaunchInfo.Data.Host.Debug = getOptions().Debug ? 1 : 0; + + auto LocalMemoryUsage = + GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); + auto PrivateMemoryUsage = + GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); + + getContext()->logger.info( + "KernelInfo {} (LocalMemory={}, PrivateMemory={})", (void *)Kernel, + LocalMemoryUsage, PrivateMemoryUsage); + + // Write shadow memory offset for local memory + if (getOptions().DetectLocals) { + if (DeviceInfo->Shadow->AllocLocalShadow( + Queue, NumWG, LaunchInfo.Data.Host.LocalShadowOffset, + LaunchInfo.Data.Host.LocalShadowOffsetEnd) != + UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for local " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + getContext()->logger.warning( + "Skip checking local memory of kernel <{}>", + GetKernelName(Kernel)); + } else { + getContext()->logger.info( + "ShadowMemory(Local, WorkGroup{}, {} - {})", NumWG, + (void *)LaunchInfo.Data.Host.LocalShadowOffset, + (void *)LaunchInfo.Data.Host.LocalShadowOffsetEnd); } + } - // Write shadow memory offset for private memory - if (getOptions().DetectPrivates) { - if (DeviceInfo->Shadow->AllocPrivateShadow( - Queue, NumWG, LaunchInfo.Data.Host.PrivateShadowOffset, - LaunchInfo.Data.Host.PrivateShadowOffsetEnd) != - UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to allocate shadow memory for private " - "memory, maybe the number of workgroup ({}) is too " - "large", - NumWG); - getContext()->logger.warning( - "Skip checking private memory of kernel <{}>", - GetKernelName(Kernel)); - } else { - getContext()->logger.info( - "ShadowMemory(Private, WorkGroup{}, {} - {})", NumWG, - (void *)LaunchInfo.Data.Host.PrivateShadowOffset, - (void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd); - } + // Write shadow memory offset for private memory + if (getOptions().DetectPrivates) { + if (DeviceInfo->Shadow->AllocPrivateShadow( + Queue, NumWG, LaunchInfo.Data.Host.PrivateShadowOffset, + LaunchInfo.Data.Host.PrivateShadowOffsetEnd) != + UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to allocate shadow memory for private " + "memory, maybe the number of workgroup ({}) is too " + "large", + NumWG); + 
getContext()->logger.warning(
+                "Skip checking private memory of kernel <{}>",
+                GetKernelName(Kernel));
+        } else {
+            getContext()->logger.info(
+                "ShadowMemory(Private, WorkGroup{}, {} - {})", NumWG,
+                (void *)LaunchInfo.Data.Host.PrivateShadowOffset,
+                (void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd);
+        }
+    }

-        // Write local arguments info
-        if (!KernelInfo->LocalArgs.empty()) {
-            std::vector<LocalArgsInfo> LocalArgsInfo;
-            for (auto [ArgIndex, ArgInfo] : KernelInfo->LocalArgs) {
-                LocalArgsInfo.push_back(ArgInfo);
-                getContext()->logger.debug(
-                    "local_args (argIndex={}, size={}, sizeWithRZ={})",
-                    ArgIndex, ArgInfo.Size, ArgInfo.SizeWithRedZone);
-            }
-            UR_CALL(LaunchInfo.Data.importLocalArgsInfo(Queue, LocalArgsInfo));
+    // Write local arguments info
+    if (!KernelInfo->LocalArgs.empty()) {
+        std::vector<LocalArgsInfo> LocalArgsInfo;
+        for (auto [ArgIndex, ArgInfo] : KernelInfo->LocalArgs) {
+            LocalArgsInfo.push_back(ArgInfo);
+            getContext()->logger.debug(
+                "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex,
+                ArgInfo.Size, ArgInfo.SizeWithRedZone);
         }
+        UR_CALL(LaunchInfo.Data.importLocalArgsInfo(Queue, LocalArgsInfo));
+    }

-        // sync asan runtime data to device side
-        UR_CALL(LaunchInfo.Data.syncToDevice(Queue));
+    // sync asan runtime data to device side
+    UR_CALL(LaunchInfo.Data.syncToDevice(Queue));

-        getContext()->logger.debug(
-            "launch_info {} (numLocalArgs={}, localArgs={})",
-            (void *)LaunchInfo.Data.getDevicePtr(),
-            LaunchInfo.Data.Host.NumLocalArgs,
-            (void *)LaunchInfo.Data.Host.LocalArgs);
-    } while (false);
+    getContext()->logger.debug("launch_info {} (numLocalArgs={}, localArgs={})",
+                               (void *)LaunchInfo.Data.getDevicePtr(),
+                               LaunchInfo.Data.Host.NumLocalArgs,
+                               (void *)LaunchInfo.Data.Host.LocalArgs);

     return UR_RESULT_SUCCESS;
 }
diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.hpp b/source/loader/layers/sanitizer/asan/asan_shadow.hpp
index 48054378fe..76abb7e35c 100644
--- a/source/loader/layers/sanitizer/asan/asan_shadow.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_shadow.hpp
@@ -12,7 +12,7 @@

 #pragma once

-#include "asan/asan_allocator.hpp"
+#include "asan_allocator.hpp"
 #include "sanitizer_common/sanitizer_libdevice.hpp"

 #include <unordered_set>
@@ -134,7 +134,7 @@ struct ShadowMemoryPVC final : public ShadowMemoryGPU {
     size_t GetShadowSize() override { return 0x180000000000ULL; }
 };

-/// Shadow Memory layout of GPU PVC device
+/// Shadow Memory layout of GPU DG2 device
 ///
 /// USM Allocation Range (48 bits)
 ///   Host/Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff
diff --git a/source/loader/layers/sanitizer/msan/msan_allocator.cpp b/source/loader/layers/sanitizer/msan/msan_allocator.cpp
new file mode 100644
index 0000000000..e0213c26b5
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_allocator.cpp
@@ -0,0 +1,26 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_allocator.cpp
+ *
+ */
+
+#include "msan_allocator.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+void MsanAllocInfo::print() {
+    getContext()->logger.info("AllocInfo(Alloc=[{}-{}), AllocSize={})",
+                              (void *)AllocBegin,
+                              (void *)(AllocBegin + AllocSize), AllocSize);
+}
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_allocator.hpp b/source/loader/layers/sanitizer/msan/msan_allocator.hpp
new file mode 100644
index 0000000000..32b85e6945
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_allocator.hpp
@@ -0,0 +1,41 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_allocator.hpp
+ *
+ */
+
+#pragma once
+
+#include "sanitizer_common/sanitizer_allocator.hpp"
+#include "sanitizer_common/sanitizer_common.hpp"
+#include "sanitizer_common/sanitizer_stacktrace.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+struct MsanAllocInfo {
+    uptr AllocBegin = 0;
+    size_t AllocSize = 0;
+
+    bool IsReleased = false;
+
+    ur_context_handle_t Context = nullptr;
+    ur_device_handle_t Device = nullptr;
+
+    StackTrace AllocStack;
+    StackTrace ReleaseStack;
+
+    void print();
+};
+
+using MsanAllocationMap = std::map<uptr, std::shared_ptr<MsanAllocInfo>>;
+using MsanAllocationIterator = MsanAllocationMap::iterator;
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_buffer.cpp b/source/loader/layers/sanitizer/msan/msan_buffer.cpp
new file mode 100644
index 0000000000..66ebb10326
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_buffer.cpp
@@ -0,0 +1,204 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_buffer.cpp
+ *
+ */
+
+#include "msan_buffer.hpp"
+#include "msan_interceptor.hpp"
+#include "sanitizer_common/sanitizer_utils.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
+    // If the user doesn't specify the src/dst row pitch and slice pitch,
+    // derive them from the region.
+    if (SrcRowPitch == 0) {
+        SrcRowPitch = Region.width;
+    }
+
+    if (SrcSlicePitch == 0) {
+        SrcSlicePitch = SrcRowPitch * Region.height;
+    }
+
+    if (DstRowPitch == 0) {
+        DstRowPitch = Region.width;
+    }
+
+    if (DstSlicePitch == 0) {
+        DstSlicePitch = DstRowPitch * Region.height;
+    }
+
+    // Calculate the src and dst addresses that actually will be copied.
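+    // For example (illustrative numbers, not from this patch): with
+    // SrcOffset = {4, 2, 1} and SrcRowPitch = 64, the first byte copied is
+    // pSrc + 4 + 64 * 2 + SrcSlicePitch * 1, i.e. column 4 of row 2 in
+    // slice 1.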
+    char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
+                      SrcSlicePitch * SrcOffset.z;
+    char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
+                      DstSlicePitch * DstOffset.z;
+
+    std::vector<ur_event_handle_t> Events;
+    Events.reserve(Region.depth);
+    // For now, USM doesn't support a 3D memory copy operation, so we can only
+    // implement it by repeatedly calling the 2D memory copy function.
+    for (size_t i = 0; i < Region.depth; i++) {
+        ur_event_handle_t NewEvent{};
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D(
+            Queue, Blocking, DstOrigin + (i * DstSlicePitch), DstRowPitch,
+            SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
+            Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
+
+        Events.push_back(NewEvent);
+    }
+
+    UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait(
+        Queue, Events.size(), Events.data(), Event));
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
+    // Sub-buffers don't maintain own allocations but rely on parent buffer.
+    if (SubBuffer) {
+        UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
+        Handle += SubBuffer->Origin;
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Device may be null; we follow the L0 adapter's practice of using the
+    // first device
+    if (!Device) {
+        auto Devices = GetDevices(Context);
+        assert(Devices.size() > 0 && "Devices should not be empty");
+        Device = Devices[0];
+    }
+    assert((void *)Device != nullptr && "Device cannot be nullptr");
+
+    std::scoped_lock Guard(Mutex);
+    auto &Allocation = Allocations[Device];
+    ur_result_t URes = UR_RESULT_SUCCESS;
+    if (!Allocation) {
+        ur_usm_desc_t USMDesc{};
+        USMDesc.align = getAlignment();
+        ur_usm_pool_handle_t Pool{};
+        URes = getMsanInterceptor()->allocateMemory(
+            Context, Device, &USMDesc, Pool, Size,
+            ur_cast<void **>(&Allocation));
+        if (URes != UR_RESULT_SUCCESS) {
+            getContext()->logger.error(
+                "Failed to allocate {} bytes memory for buffer {}", Size, this);
+            return URes;
+        }
+
+        if (HostPtr) {
+            ManagedQueue Queue(Context, Device);
+            URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
+            if (URes != UR_RESULT_SUCCESS) {
+                getContext()->logger.error(
+                    "Failed to copy {} bytes data from host "
+                    "pointer {} to buffer {}",
+                    Size, HostPtr, this);
+                return URes;
+            }
+        }
+    }
+
+    Handle = Allocation;
+
+    if (!LastSyncedDevice.hDevice) {
+        LastSyncedDevice = MemBuffer::Device_t{Device, Handle};
+        return URes;
+    }
+
+    // If the device requesting the allocation is not the last synced one, we
+    // need to do data migration.
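+    // The migration below is staged through a host USM allocation
+    // (Allocations[nullptr]): data is first copied from the last synced
+    // device into the host buffer, then from the host buffer to the newly
+    // requested device.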
+    if (Device != LastSyncedDevice.hDevice) {
+        auto &HostAllocation = Allocations[nullptr];
+        if (!HostAllocation) {
+            ur_usm_desc_t USMDesc{};
+            USMDesc.align = getAlignment();
+            ur_usm_pool_handle_t Pool{};
+            URes = getMsanInterceptor()->allocateMemory(
+                Context, nullptr, &USMDesc, Pool, Size,
+                ur_cast<void **>(&HostAllocation));
+            if (URes != UR_RESULT_SUCCESS) {
+                getContext()->logger.error("Failed to allocate {} bytes host "
+                                           "USM for buffer {} migration",
+                                           Size, this);
+                return URes;
+            }
+        }
+
+        // Copy data from last synced device to host
+        {
+            ManagedQueue Queue(Context, LastSyncedDevice.hDevice);
+            URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size,
+                0, nullptr, nullptr);
+            if (URes != UR_RESULT_SUCCESS) {
+                getContext()->logger.error(
+                    "Failed to migrate memory buffer data");
+                return URes;
+            }
+        }
+
+        // Sync data back to device
+        {
+            ManagedQueue Queue(Context, Device);
+            URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, Allocation, HostAllocation, Size, 0, nullptr,
+                nullptr);
+            if (URes != UR_RESULT_SUCCESS) {
+                getContext()->logger.error(
+                    "Failed to migrate memory buffer data");
+                return URes;
+            }
+        }
+    }
+
+    LastSyncedDevice = MemBuffer::Device_t{Device, Handle};
+
+    return URes;
+}
+
+ur_result_t MemBuffer::free() {
+    for (const auto &[_, Ptr] : Allocations) {
+        ur_result_t URes = getContext()->urDdiTable.USM.pfnFree(Context, Ptr);
+        if (URes != UR_RESULT_SUCCESS) {
+            getContext()->logger.error("Failed to free buffer handle {}", Ptr);
+            return URes;
+        }
+    }
+    Allocations.clear();
+    return UR_RESULT_SUCCESS;
+}
+
+size_t MemBuffer::getAlignment() {
+    // Choose an alignment that is at most 128 and is the next power of 2
+    // for sizes less than 128.
+    // TODO: If we don't set the alignment size explicitly, the device will
+    // usually choose a very large size (more than 1k). Then the sanitizer
+    // will allocate extra unnecessary memory. Not sure if this will impact
+    // performance.
+    size_t MsbIdx = 63 - __builtin_clzl(Size);
+    size_t Alignment = (1ULL << (MsbIdx + 1));
+    if (Alignment > 128) {
+        Alignment = 128;
+    }
+    return Alignment;
+}
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_buffer.hpp b/source/loader/layers/sanitizer/msan/msan_buffer.hpp
new file mode 100644
index 0000000000..e953ac3e66
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_buffer.hpp
@@ -0,0 +1,82 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_buffer.hpp
+ *
+ */
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <optional>
+
+#include "ur/ur.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+struct MemBuffer {
+    // Buffer constructor
+    MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
+        : Context(Context), Size(Size), HostPtr(HostPtr) {}
+
+    // Sub-buffer constructor
+    MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
+        : Context(Parent->Context), Size(Size), SubBuffer{{Parent, Origin}} {}
+
+    ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);
+
+    ur_result_t free();
+
+    size_t getAlignment();
+
+    std::unordered_map<ur_device_handle_t, char *> Allocations;
+
+    enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };
+
+    struct Mapping {
+        size_t Offset;
+        size_t Size;
+    };
+
+    std::unordered_map<void *, Mapping> Mappings;
+
+    ur_context_handle_t Context;
+
+    struct Device_t {
+        ur_device_handle_t hDevice;
+        char *MemHandle;
+    };
+    Device_t LastSyncedDevice{};
+
+    size_t Size;
+
+    char *HostPtr{};
+
+    struct SubBuffer_t {
+        std::shared_ptr<MemBuffer> Parent;
+        size_t Origin;
+    };
+
+    std::optional<SubBuffer_t> SubBuffer;
+
+    std::atomic<int32_t> RefCount = 1;
+
+    ur_shared_mutex Mutex;
+};
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/source/loader/layers/sanitizer/msan/msan_ddi.cpp
new file mode 100644
index 0000000000..87438a1f99
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_ddi.cpp
@@ -0,0 +1,1528 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_ddi.cpp
+ *
+ */
+
+#include "msan_ddi.hpp"
+#include "msan_interceptor.hpp"
+#include "sanitizer_common/sanitizer_utils.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+#include <memory>
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+namespace {
+
+ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices,
+                         const ur_device_handle_t *phDevices) {
+    std::shared_ptr<ContextInfo> CI;
+    UR_CALL(getMsanInterceptor()->insertContext(Context, CI));
+    for (uint32_t i = 0; i < numDevices; ++i) {
+        auto hDevice = phDevices[i];
+        std::shared_ptr<DeviceInfo> DI;
+        UR_CALL(getMsanInterceptor()->insertDevice(hDevice, DI));
+        DI->Type = GetDeviceType(Context, hDevice);
+        if (DI->Type == DeviceType::UNKNOWN) {
+            getContext()->logger.error("Unsupported device");
+            return UR_RESULT_ERROR_INVALID_DEVICE;
+        }
+        getContext()->logger.info(
+            "DeviceInfo {} (Type={}, IsSupportSharedSystemUSM={})",
+            (void *)DI->Handle, ToString(DI->Type),
+            DI->IsSupportSharedSystemUSM);
+        getContext()->logger.info("Add {} into context {}", (void *)DI->Handle,
+                                  (void *)Context);
+        if (!DI->Shadow) {
+            UR_CALL(DI->allocShadowMemory(Context));
+        }
+        CI->DeviceList.emplace_back(hDevice);
+        CI->AllocInfosMap[hDevice];
+    }
+    return UR_RESULT_SUCCESS;
+}
+
+bool isInstrumentedKernel(ur_kernel_handle_t hKernel) {
+    auto hProgram = GetProgram(hKernel);
+    auto PI = getMsanInterceptor()->getProgramInfo(hProgram);
+    return PI->isKernelInstrumented(hKernel);
+}
+
+} // namespace
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urAdapterGet
+ur_result_t urAdapterGet(
+    uint32_t
+        NumEntries, ///< [in] the number of adapters to be added to phAdapters.
+    ///< If phAdapters is not NULL, then NumEntries should be greater than
+    ///< zero, otherwise ::UR_RESULT_ERROR_INVALID_SIZE,
+    ///< will be returned.
+    ur_adapter_handle_t *
+        phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters.
+    ///< If NumEntries is less than the number of adapters available, then
+    ///< ::urAdapterGet shall only retrieve that number of platforms.
+    uint32_t *
+        pNumAdapters ///< [out][optional] returns the total number of adapters available.
+) {
+    auto pfnAdapterGet = getContext()->urDdiTable.Global.pfnAdapterGet;
+
+    // FIXME: This is a W/A to disable heap extended for MSAN so that we can reserve large VA of GPU.
+    setenv("NEOReadDebugKeys", "1", 1);
+    setenv("AllocateHostAllocationsInHeapExtendedHost", "0", 1);
+    setenv("UseHighAlignmentForHeapExtended", "0", 1);
+
+    ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters);
+    if (result == UR_RESULT_SUCCESS && phAdapters) {
+        const uint32_t NumAdapters = pNumAdapters ?
*pNumAdapters : NumEntries; + for (uint32_t i = 0; i < NumAdapters; ++i) { + UR_CALL(getMsanInterceptor()->holdAdapter(phAdapters[i])); + } + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMDeviceAlloc +ur_result_t urUSMDeviceAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t + pool, ///< [in][optional] Pointer to a pool created using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppMem ///< [out] pointer to USM device memory object +) { + getContext()->logger.debug("==== urUSMDeviceAlloc"); + + return getMsanInterceptor()->allocateMemory(hContext, hDevice, pUSMDesc, + pool, size, ppMem); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithIL +ur_result_t urProgramCreateWithIL( + ur_context_handle_t hContext, ///< [in] handle of the context instance + const void *pIL, ///< [in] pointer to IL binary. + size_t length, ///< [in] length of `pIL` in bytes. + const ur_program_properties_t * + pProperties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of program object created. +) { + auto pfnProgramCreateWithIL = + getContext()->urDdiTable.Program.pfnCreateWithIL; + + getContext()->logger.debug("==== urProgramCreateWithIL"); + + UR_CALL( + pfnProgramCreateWithIL(hContext, pIL, length, pProperties, phProgram)); + UR_CALL(getMsanInterceptor()->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithBinary +ur_result_t urProgramCreateWithBinary( + ur_context_handle_t hContext, ///< [in] handle of the context instance + uint32_t numDevices, ///< [in] number of devices + ur_device_handle_t * + phDevices, ///< [in][range(0, numDevices)] a pointer to a list of device handles. The + ///< binaries are loaded for devices specified in this list. + size_t * + pLengths, ///< [in][range(0, numDevices)] array of sizes of program binaries + ///< specified by `pBinaries` (in bytes). + const uint8_t ** + ppBinaries, ///< [in][range(0, numDevices)] pointer to program binaries to be loaded + ///< for devices specified by `phDevices`. + const ur_program_properties_t * + pProperties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of Program object created. +) { + auto pfnProgramCreateWithBinary = + getContext()->urDdiTable.Program.pfnCreateWithBinary; + + getContext()->logger.debug("==== urProgramCreateWithBinary"); + + UR_CALL(pfnProgramCreateWithBinary(hContext, numDevices, phDevices, + pLengths, ppBinaries, pProperties, + phProgram)); + UR_CALL(getMsanInterceptor()->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithNativeHandle +ur_result_t urProgramCreateWithNativeHandle( + ur_native_handle_t + hNativeProgram, ///< [in][nocheck] the native handle of the program. 
+ ur_context_handle_t hContext, ///< [in] handle of the context instance + const ur_program_native_properties_t * + pProperties, ///< [in][optional] pointer to native program properties struct. + ur_program_handle_t * + phProgram ///< [out] pointer to the handle of the program object created. +) { + auto pfnProgramCreateWithNativeHandle = + getContext()->urDdiTable.Program.pfnCreateWithNativeHandle; + + getContext()->logger.debug("==== urProgramCreateWithNativeHandle"); + + UR_CALL(pfnProgramCreateWithNativeHandle(hNativeProgram, hContext, + pProperties, phProgram)); + UR_CALL(getMsanInterceptor()->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramRetain +ur_result_t +urProgramRetain(ur_program_handle_t + hProgram ///< [in][retain] handle for the Program to retain +) { + auto pfnRetain = getContext()->urDdiTable.Program.pfnRetain; + + getContext()->logger.debug("==== urProgramRetain"); + + UR_CALL(pfnRetain(hProgram)); + + auto ProgramInfo = getMsanInterceptor()->getProgramInfo(hProgram); + UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + ProgramInfo->RefCount++; + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramBuild +ur_result_t urProgramBuild( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_program_handle_t hProgram, ///< [in] handle of the program object + const char *pOptions ///< [in] string of build options +) { + auto pfnProgramBuild = getContext()->urDdiTable.Program.pfnBuild; + + getContext()->logger.debug("==== urProgramBuild"); + + UR_CALL(pfnProgramBuild(hContext, hProgram, pOptions)); + + UR_CALL(getMsanInterceptor()->registerProgram(hProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramBuildExp +ur_result_t urProgramBuildExp( + ur_program_handle_t hProgram, ///< [in] Handle of the program to build. + uint32_t numDevices, ///< [in] number of devices + ur_device_handle_t * + phDevices, ///< [in][range(0, numDevices)] pointer to array of device handles + const char * + pOptions ///< [in][optional] pointer to build options null-terminated string. +) { + auto pfnBuildExp = getContext()->urDdiTable.ProgramExp.pfnBuildExp; + + getContext()->logger.debug("==== urProgramBuildExp"); + + UR_CALL(pfnBuildExp(hProgram, numDevices, phDevices, pOptions)); + UR_CALL(getMsanInterceptor()->registerProgram(hProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramLink +ur_result_t urProgramLink( + ur_context_handle_t hContext, ///< [in] handle of the context instance. + uint32_t count, ///< [in] number of program handles in `phPrograms`. + const ur_program_handle_t * + phPrograms, ///< [in][range(0, count)] pointer to array of program handles. + const char * + pOptions, ///< [in][optional] pointer to linker options null-terminated string. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of program object created. 
+) { + auto pfnProgramLink = getContext()->urDdiTable.Program.pfnLink; + + getContext()->logger.debug("==== urProgramLink"); + + UR_CALL(pfnProgramLink(hContext, count, phPrograms, pOptions, phProgram)); + + UR_CALL(getMsanInterceptor()->insertProgram(*phProgram)); + UR_CALL(getMsanInterceptor()->registerProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramLinkExp +ur_result_t urProgramLinkExp( + ur_context_handle_t hContext, ///< [in] handle of the context instance. + uint32_t numDevices, ///< [in] number of devices + ur_device_handle_t * + phDevices, ///< [in][range(0, numDevices)] pointer to array of device handles + uint32_t count, ///< [in] number of program handles in `phPrograms`. + const ur_program_handle_t * + phPrograms, ///< [in][range(0, count)] pointer to array of program handles. + const char * + pOptions, ///< [in][optional] pointer to linker options null-terminated string. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of program object created. +) { + auto pfnProgramLinkExp = getContext()->urDdiTable.ProgramExp.pfnLinkExp; + + getContext()->logger.debug("==== urProgramLinkExp"); + + UR_CALL(pfnProgramLinkExp(hContext, numDevices, phDevices, count, + phPrograms, pOptions, phProgram)); + + UR_CALL(getMsanInterceptor()->insertProgram(*phProgram)); + UR_CALL(getMsanInterceptor()->registerProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramRelease +ur_result_t urProgramRelease( + ur_program_handle_t + hProgram ///< [in][release] handle for the Program to release +) { + auto pfnProgramRelease = getContext()->urDdiTable.Program.pfnRelease; + + getContext()->logger.debug("==== urProgramRelease"); + + UR_CALL(pfnProgramRelease(hProgram)); + + auto ProgramInfo = getMsanInterceptor()->getProgramInfo(hProgram); + UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + if (--ProgramInfo->RefCount == 0) { + UR_CALL(getMsanInterceptor()->unregisterProgram(hProgram)); + UR_CALL(getMsanInterceptor()->eraseProgram(hProgram)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and + ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of global work-items in workDim that will execute the kernel + ///< function + const size_t * + pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that + ///< specify the number of local work-items forming a work-group that will + ///< execute the kernel function. + ///< If nullptr, the runtime implementation will choose the work-group + ///< size. 
+ uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait + ///< event. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< kernel execution instance. +) { + auto pfnKernelLaunch = getContext()->urDdiTable.Enqueue.pfnKernelLaunch; + + getContext()->logger.debug("==== urEnqueueKernelLaunch"); + + if (!isInstrumentedKernel(hKernel)) { + return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); + } + + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); + UR_CALL(LaunchInfo.initialize()); + + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + ur_event_handle_t hEvent{}; + ur_result_t result = + pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, LaunchInfo.LocalWorkSize.data(), + numEventsInWaitList, phEventWaitList, &hEvent); + + if (result == UR_RESULT_SUCCESS) { + UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, + LaunchInfo)); + } + + if (phEvent) { + *phEvent = hEvent; + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextCreate +ur_result_t urContextCreate( + uint32_t numDevices, ///< [in] the number of devices given in phDevices + const ur_device_handle_t + *phDevices, ///< [in][range(0, numDevices)] array of handle of devices. + const ur_context_properties_t * + pProperties, ///< [in][optional] pointer to context creation properties. + ur_context_handle_t + *phContext ///< [out] pointer to handle of context object created +) { + auto pfnCreate = getContext()->urDdiTable.Context.pfnCreate; + + getContext()->logger.debug("==== urContextCreate"); + + ur_result_t result = + pfnCreate(numDevices, phDevices, pProperties, phContext); + + if (result == UR_RESULT_SUCCESS) { + UR_CALL(setupContext(*phContext, numDevices, phDevices)); + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextCreateWithNativeHandle +ur_result_t urContextCreateWithNativeHandle( + ur_native_handle_t + hNativeContext, ///< [in][nocheck] the native handle of the getContext()-> + ur_adapter_handle_t hAdapter, + uint32_t numDevices, ///< [in] number of devices associated with the context + const ur_device_handle_t * + phDevices, ///< [in][range(0, numDevices)] list of devices associated with the context + const ur_context_native_properties_t * + pProperties, ///< [in][optional] pointer to native context properties struct + ur_context_handle_t * + phContext ///< [out] pointer to the handle of the context object created. 
+) {
+    auto pfnCreateWithNativeHandle =
+        getContext()->urDdiTable.Context.pfnCreateWithNativeHandle;
+
+    getContext()->logger.debug("==== urContextCreateWithNativeHandle");
+
+    ur_result_t result =
+        pfnCreateWithNativeHandle(hNativeContext, hAdapter, numDevices,
+                                  phDevices, pProperties, phContext);
+
+    if (result == UR_RESULT_SUCCESS) {
+        UR_CALL(setupContext(*phContext, numDevices, phDevices));
+    }
+
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urContextRetain
+ur_result_t urContextRetain(
+    ur_context_handle_t
+        hContext ///< [in] handle of the context to get a reference of.
+) {
+    auto pfnRetain = getContext()->urDdiTable.Context.pfnRetain;
+
+    getContext()->logger.debug("==== urContextRetain");
+
+    UR_CALL(pfnRetain(hContext));
+
+    auto ContextInfo = getMsanInterceptor()->getContextInfo(hContext);
+    UR_ASSERT(ContextInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE);
+    ContextInfo->RefCount++;
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urContextRelease
+ur_result_t urContextRelease(
+    ur_context_handle_t hContext ///< [in] handle of the context to release.
+) {
+    auto pfnRelease = getContext()->urDdiTable.Context.pfnRelease;
+
+    getContext()->logger.debug("==== urContextRelease");
+
+    UR_CALL(pfnRelease(hContext));
+
+    auto ContextInfo = getMsanInterceptor()->getContextInfo(hContext);
+    UR_ASSERT(ContextInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE);
+    if (--ContextInfo->RefCount == 0) {
+        UR_CALL(getMsanInterceptor()->eraseContext(hContext));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemBufferCreate
+ur_result_t urMemBufferCreate(
+    ur_context_handle_t hContext, ///< [in] handle of the context object
+    ur_mem_flags_t flags, ///< [in] allocation and usage information flags
+    size_t size, ///< [in] size in bytes of the memory object to be allocated
+    const ur_buffer_properties_t
+        *pProperties, ///< [in][optional] pointer to buffer creation properties
+    ur_mem_handle_t
+        *phBuffer ///< [out] pointer to handle of the memory buffer created
+) {
+    if (nullptr == phBuffer) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    getContext()->logger.debug("==== urMemBufferCreate");
+
+    void *Host = nullptr;
+    if (pProperties) {
+        Host = pProperties->pHost;
+    }
+
+    char *hostPtrOrNull = (flags & UR_MEM_FLAG_USE_HOST_POINTER)
+                              ? ur_cast<char *>(Host)
+                              : nullptr;
+
+    std::shared_ptr<MemBuffer> pMemBuffer =
+        std::make_shared<MemBuffer>(hContext, size, hostPtrOrNull);
+
+    if (Host && (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) {
+        std::shared_ptr<ContextInfo> CtxInfo =
+            getMsanInterceptor()->getContextInfo(hContext);
+        for (const auto &hDevice : CtxInfo->DeviceList) {
+            ManagedQueue InternalQueue(hContext, hDevice);
+            char *Handle = nullptr;
+            UR_CALL(pMemBuffer->getHandle(hDevice, Handle));
+            UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+                InternalQueue, true, Handle, Host, size, 0, nullptr, nullptr));
+        }
+    }
+
+    ur_result_t result = getMsanInterceptor()->insertMemBuffer(pMemBuffer);
+    *phBuffer = ur_cast<ur_mem_handle_t>(pMemBuffer.get());
+
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemGetInfo
+ur_result_t urMemGetInfo(
+    ur_mem_handle_t
+        hMemory, ///< [in] handle to the memory object being queried.
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hMemory)) {
+        UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+        switch (propName) {
+        case UR_MEM_INFO_CONTEXT: {
+            return ReturnValue(MemBuffer->Context);
+        }
+        case UR_MEM_INFO_SIZE: {
+            return ReturnValue(size_t{MemBuffer->Size});
+        }
+        default: {
+            return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+        }
+        }
+    } else {
+        UR_CALL(
+            pfnGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemRetain
+ur_result_t urMemRetain(
+    ur_mem_handle_t hMem ///< [in] handle of the memory object to get access
+) {
+    auto pfnRetain = getContext()->urDdiTable.Mem.pfnRetain;
+
+    getContext()->logger.debug("==== urMemRetain");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hMem)) {
+        MemBuffer->RefCount++;
+    } else {
+        UR_CALL(pfnRetain(hMem));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemRelease
+ur_result_t urMemRelease(
+    ur_mem_handle_t hMem ///< [in] handle of the memory object to release
+) {
+    auto pfnRelease = getContext()->urDdiTable.Mem.pfnRelease;
+
+    getContext()->logger.debug("==== urMemRelease");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hMem)) {
+        if (--MemBuffer->RefCount != 0) {
+            return UR_RESULT_SUCCESS;
+        }
+        UR_CALL(MemBuffer->free());
+        UR_CALL(getMsanInterceptor()->eraseMemBuffer(hMem));
+    } else {
+        UR_CALL(pfnRelease(hMem));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemBufferPartition
+ur_result_t urMemBufferPartition(
+    ur_mem_handle_t
+        hBuffer, ///< [in] handle of the buffer object to allocate from
+    ur_mem_flags_t flags, ///< [in] allocation and usage information flags
+    ur_buffer_create_type_t bufferCreateType, ///< [in] buffer creation type
+    const ur_buffer_region_t
+        *pRegion, ///< [in] pointer to buffer create region information
+    ur_mem_handle_t
+        *phMem ///< [out] pointer to the handle of sub buffer created
+) {
+    auto pfnBufferPartition = getContext()->urDdiTable.Mem.pfnBufferPartition;
+
+    getContext()->logger.debug("==== urMemBufferPartition");
+
+    if (auto ParentBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+        if (ParentBuffer->Size < (pRegion->origin + pRegion->size)) {
+            return UR_RESULT_ERROR_INVALID_BUFFER_SIZE;
+        }
+        std::shared_ptr<MemBuffer> SubBuffer = std::make_shared<MemBuffer>(
+            ParentBuffer, pRegion->origin, pRegion->size);
+        UR_CALL(getMsanInterceptor()->insertMemBuffer(SubBuffer));
+        *phMem = reinterpret_cast<ur_mem_handle_t>(SubBuffer.get());
+    } else {
+        UR_CALL(pfnBufferPartition(hBuffer, flags, bufferCreateType, pRegion,
+                                   phMem));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemGetNativeHandle
+ur_result_t urMemGetNativeHandle(
+    ur_mem_handle_t hMem, ///< [in] handle of the mem.
+    ur_device_handle_t hDevice,
+    ur_native_handle_t
+        *phNativeMem ///< [out] a pointer to the native handle of the mem.
+) {
+    auto pfnGetNativeHandle = getContext()->urDdiTable.Mem.pfnGetNativeHandle;
+
+    getContext()->logger.debug("==== urMemGetNativeHandle");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hMem)) {
+        char *Handle = nullptr;
+        UR_CALL(MemBuffer->getHandle(hDevice, Handle));
+        *phNativeMem = ur_cast<ur_native_handle_t>(Handle);
+    } else {
+        UR_CALL(pfnGetNativeHandle(hMem, hDevice, phNativeMem));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferRead
+ur_result_t urEnqueueMemBufferRead(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false)
+    size_t offset, ///< [in] offset in bytes in the buffer object
+    size_t size, ///< [in] size in bytes of data being read
+    void *pDst, ///< [in] pointer to host memory where data is to be read into
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferRead = getContext()->urDdiTable.Enqueue.pfnMemBufferRead;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferRead");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+        ur_device_handle_t Device = GetDevice(hQueue);
+        char *pSrc = nullptr;
+        UR_CALL(MemBuffer->getHandle(Device, pSrc));
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, blockingRead, pDst, pSrc + offset, size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+    } else {
+        UR_CALL(pfnMemBufferRead(hQueue, hBuffer, blockingRead, offset, size,
+                                 pDst, numEventsInWaitList, phEventWaitList,
+                                 phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferWrite
+ur_result_t urEnqueueMemBufferWrite(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool
+        blockingWrite, ///< [in] indicates blocking (true), non-blocking (false)
+    size_t offset, ///< [in] offset in bytes in the buffer object
+    size_t size, ///< [in] size in bytes of data being written
+    const void
+        *pSrc, ///< [in] pointer to host memory where data is to be written from
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. +) { + auto pfnMemBufferWrite = getContext()->urDdiTable.Enqueue.pfnMemBufferWrite; + + getContext()->logger.debug("==== urEnqueueMemBufferWrite"); + + if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) { + ur_device_handle_t Device = GetDevice(hQueue); + char *pDst = nullptr; + UR_CALL(MemBuffer->getHandle(Device, pDst)); + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, blockingWrite, pDst + offset, pSrc, size, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferWrite(hQueue, hBuffer, blockingWrite, offset, size, + pSrc, numEventsInWaitList, phEventWaitList, + phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferReadRect +ur_result_t urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOrigin, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t + bufferRowPitch, ///< [in] length of each row in bytes in the buffer object + size_t + bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being read + size_t + hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by + ///< dst + size_t + hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region + ///< pointed by dst + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) {
+    auto pfnMemBufferReadRect =
+        getContext()->urDdiTable.Enqueue.pfnMemBufferReadRect;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferReadRect");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+        char *SrcHandle = nullptr;
+        ur_device_handle_t Device = GetDevice(hQueue);
+        UR_CALL(MemBuffer->getHandle(Device, SrcHandle));
+
+        UR_CALL(EnqueueMemCopyRectHelper(
+            hQueue, SrcHandle, ur_cast<char *>(pDst), bufferOrigin, hostOrigin,
+            region, bufferRowPitch, bufferSlicePitch, hostRowPitch,
+            hostSlicePitch, blockingRead, numEventsInWaitList, phEventWaitList,
+            phEvent));
+    } else {
+        UR_CALL(pfnMemBufferReadRect(
+            hQueue, hBuffer, blockingRead, bufferOrigin, hostOrigin, region,
+            bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch,
+            pDst, numEventsInWaitList, phEventWaitList, phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferWriteRect
+ur_result_t urEnqueueMemBufferWriteRect(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object
+    bool
+        blockingWrite, ///< [in] indicates blocking (true), non-blocking (false)
+    ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer
+    ur_rect_offset_t hostOrigin, ///< [in] 3D offset in the host region
+    ur_rect_region_t
+        region, ///< [in] 3D rectangular region descriptor: width, height, depth
+    size_t
+        bufferRowPitch, ///< [in] length of each row in bytes in the buffer object
+    size_t
+        bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being
+    ///< written
+    size_t
+        hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by
+    ///< src
+    size_t
+        hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region
+    ///< pointed by src
+    void
+        *pSrc, ///< [in] pointer to host memory where data is to be written from
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferWriteRect =
+        getContext()->urDdiTable.Enqueue.pfnMemBufferWriteRect;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferWriteRect");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+        char *DstHandle = nullptr;
+        ur_device_handle_t Device = GetDevice(hQueue);
+        UR_CALL(MemBuffer->getHandle(Device, DstHandle));
+
+        UR_CALL(EnqueueMemCopyRectHelper(
+            hQueue, ur_cast<char *>(pSrc), DstHandle, hostOrigin, bufferOrigin,
+            region, hostRowPitch, hostSlicePitch, bufferRowPitch,
+            bufferSlicePitch, blockingWrite, numEventsInWaitList,
+            phEventWaitList, phEvent));
+    } else {
+        UR_CALL(pfnMemBufferWriteRect(
+            hQueue, hBuffer, blockingWrite, bufferOrigin, hostOrigin, region,
+            bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch,
+            pSrc, numEventsInWaitList, phEventWaitList, phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferCopy
+ur_result_t urEnqueueMemBufferCopy(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBufferSrc, ///< [in][bounds(srcOffset, size)] handle of the src buffer object
+    ur_mem_handle_t
+        hBufferDst, ///< [in][bounds(dstOffset, size)] handle of the dest buffer object
+    size_t srcOffset, ///< [in] offset into hBufferSrc to begin copying from
+    size_t dstOffset, ///< [in] offset into hBufferDst to begin copying into
+    size_t size, ///< [in] size in bytes of data being copied
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferCopy = getContext()->urDdiTable.Enqueue.pfnMemBufferCopy;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferCopy");
+
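+    // A copy must be either between two tracked buffers or between two raw
+    // driver buffers; mixing the two is rejected by the assertion below.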
+    auto SrcBuffer = getMsanInterceptor()->getMemBuffer(hBufferSrc);
+    auto DstBuffer = getMsanInterceptor()->getMemBuffer(hBufferDst);
+
+    UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer),
+              UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+    if (SrcBuffer && DstBuffer) {
+        ur_device_handle_t Device = GetDevice(hQueue);
+        char *SrcHandle = nullptr;
+        UR_CALL(SrcBuffer->getHandle(Device, SrcHandle));
+
+        char *DstHandle = nullptr;
+        UR_CALL(DstBuffer->getHandle(Device, DstHandle));
+
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, false, DstHandle + dstOffset, SrcHandle + srcOffset, size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+    } else {
+        UR_CALL(pfnMemBufferCopy(hQueue, hBufferSrc, hBufferDst, srcOffset,
+                                 dstOffset, size, numEventsInWaitList,
+                                 phEventWaitList, phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferCopyRect
+ur_result_t urEnqueueMemBufferCopyRect(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBufferSrc, ///< [in][bounds(srcOrigin, region)] handle of the source buffer object
+    ur_mem_handle_t
+        hBufferDst, ///< [in][bounds(dstOrigin, region)] handle of the dest buffer object
+    ur_rect_offset_t srcOrigin, ///< [in] 3D offset in the source buffer
+    ur_rect_offset_t dstOrigin, ///< [in] 3D offset in the destination buffer
+    ur_rect_region_t
+        region, ///< [in] source 3D rectangular region descriptor: width, height, depth
+    size_t
+        srcRowPitch, ///< [in] length of each row in bytes in the source buffer object
+    size_t
+        srcSlicePitch, ///< [in] length of each 2D slice in bytes in the source buffer object
+    size_t
+        dstRowPitch, ///< [in] length of each row in bytes in the destination buffer object
+    size_t
+        dstSlicePitch, ///< [in] length of each 2D slice in bytes in the destination buffer object
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) { + auto pfnMemBufferCopyRect = + getContext()->urDdiTable.Enqueue.pfnMemBufferCopyRect; + + getContext()->logger.debug("==== urEnqueueMemBufferCopyRect"); + + auto SrcBuffer = getMsanInterceptor()->getMemBuffer(hBufferSrc); + auto DstBuffer = getMsanInterceptor()->getMemBuffer(hBufferDst); + + UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + if (SrcBuffer && DstBuffer) { + ur_device_handle_t Device = GetDevice(hQueue); + char *SrcHandle = nullptr; + UR_CALL(SrcBuffer->getHandle(Device, SrcHandle)); + + char *DstHandle = nullptr; + UR_CALL(DstBuffer->getHandle(Device, DstHandle)); + + UR_CALL(EnqueueMemCopyRectHelper( + hQueue, SrcHandle, DstHandle, srcOrigin, dstOrigin, region, + srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, false, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferCopyRect( + hQueue, hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, + srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, + numEventsInWaitList, phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferFill +ur_result_t urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object + const void *pPattern, ///< [in] pointer to the fill pattern + size_t patternSize, ///< [in] size in bytes of the pattern + size_t offset, ///< [in] offset into the buffer + size_t size, ///< [in] fill size in bytes, must be a multiple of patternSize + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) {
+    auto pfnMemBufferFill = getContext()->urDdiTable.Enqueue.pfnMemBufferFill;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferFill");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+        char *Handle = nullptr;
+        ur_device_handle_t Device = GetDevice(hQueue);
+        UR_CALL(MemBuffer->getHandle(Device, Handle));
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMFill(
+            hQueue, Handle + offset, patternSize, pPattern, size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+    } else {
+        UR_CALL(pfnMemBufferFill(hQueue, hBuffer, pPattern, patternSize, offset,
+                                 size, numEventsInWaitList, phEventWaitList,
+                                 phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferMap
+ur_result_t urEnqueueMemBufferMap(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool blockingMap, ///< [in] indicates blocking (true), non-blocking (false)
+    ur_map_flags_t mapFlags, ///< [in] flags for read, write, readwrite mapping
+    size_t offset, ///< [in] offset in bytes of the buffer region being mapped
+    size_t size, ///< [in] size in bytes of the buffer region being mapped
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent, ///< [out][optional] return an event object that identifies this particular
+    ///< command instance.
+    void **ppRetMap ///< [out] return mapped pointer. TODO: move it before
+                    ///< numEventsInWaitList?
+) {
+    auto pfnMemBufferMap = getContext()->urDdiTable.Enqueue.pfnMemBufferMap;
+
+    getContext()->logger.debug("==== urEnqueueMemBufferMap");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hBuffer)) {
+
+        // Translate the host access mode info.
+        MemBuffer::AccessMode AccessMode = MemBuffer::UNKNOWN;
+        if (mapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) {
+            AccessMode = MemBuffer::WRITE_ONLY;
+        } else {
+            if (mapFlags & UR_MAP_FLAG_READ) {
+                AccessMode = MemBuffer::READ_ONLY;
+                if (mapFlags & UR_MAP_FLAG_WRITE) {
+                    AccessMode = MemBuffer::READ_WRITE;
+                }
+            } else if (mapFlags & UR_MAP_FLAG_WRITE) {
+                AccessMode = MemBuffer::WRITE_ONLY;
+            }
+        }
+
+        UR_ASSERT(AccessMode != MemBuffer::UNKNOWN,
+                  UR_RESULT_ERROR_INVALID_ARGUMENT);
+
+        ur_device_handle_t Device = GetDevice(hQueue);
+        // If the buffer wraps a host pointer, reuse it; otherwise allocate a
+        // new host USM allocation to back the mapping.
+        if (MemBuffer->HostPtr) {
+            *ppRetMap = MemBuffer->HostPtr + offset;
+        } else {
+            ur_context_handle_t Context = GetContext(hQueue);
+            ur_usm_desc_t USMDesc{};
+            USMDesc.align = MemBuffer->getAlignment();
+            ur_usm_pool_handle_t Pool{};
+            UR_CALL(getContext()->urDdiTable.USM.pfnHostAlloc(
+                Context, &USMDesc, Pool, size, ppRetMap));
+        }
+
+        // Strictly speaking, this copy is unnecessary for write-only mappings,
+        // but skipping it would leave no event to hand back to the user, so
+        // the copy is always performed.
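+        // Copy the buffer contents into the mapped host region, then record
+        // the {offset, size} mapping so urEnqueueMemUnmap can write the data
+        // back later.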
+        char *SrcHandle = nullptr;
+        UR_CALL(MemBuffer->getHandle(Device, SrcHandle));
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, blockingMap, *ppRetMap, SrcHandle + offset, size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+
+        {
+            std::scoped_lock Guard(MemBuffer->Mutex);
+            UR_ASSERT(MemBuffer->Mappings.find(*ppRetMap) ==
+                          MemBuffer->Mappings.end(),
+                      UR_RESULT_ERROR_INVALID_VALUE);
+            MemBuffer->Mappings[*ppRetMap] = {offset, size};
+        }
+    } else {
+        UR_CALL(pfnMemBufferMap(hQueue, hBuffer, blockingMap, mapFlags, offset,
+                                size, numEventsInWaitList, phEventWaitList,
+                                phEvent, ppRetMap));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemUnmap
+ur_result_t urEnqueueMemUnmap(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hMem, ///< [in] handle of the memory (buffer or image) object
+    void *pMappedPtr, ///< [in] mapped host address
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemUnmap = getContext()->urDdiTable.Enqueue.pfnMemUnmap;
+
+    getContext()->logger.debug("==== urEnqueueMemUnmap");
+
+    if (auto MemBuffer = getMsanInterceptor()->getMemBuffer(hMem)) {
+        MemBuffer::Mapping Mapping{};
+        {
+            std::scoped_lock Guard(MemBuffer->Mutex);
+            auto It = MemBuffer->Mappings.find(pMappedPtr);
+            UR_ASSERT(It != MemBuffer->Mappings.end(),
+                      UR_RESULT_ERROR_INVALID_VALUE);
+            Mapping = It->second;
+            MemBuffer->Mappings.erase(It);
+        }
+
+        // Write the mapped data back to the device, and release the mapping
+        // memory if we allocated a host USM for it. Until UR supports event
+        // callbacks, this has to be a blocking copy.
+        char *DstHandle = nullptr;
+        ur_context_handle_t Context = GetContext(hQueue);
+        ur_device_handle_t Device = GetDevice(hQueue);
+        UR_CALL(MemBuffer->getHandle(Device, DstHandle));
+        UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, true, DstHandle + Mapping.Offset, pMappedPtr, Mapping.Size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+
+        if (!MemBuffer->HostPtr) {
+            UR_CALL(getContext()->urDdiTable.USM.pfnFree(Context, pMappedPtr));
+        }
+    } else {
+        UR_CALL(pfnMemUnmap(hQueue, hMem, pMappedPtr, numEventsInWaitList,
+                            phEventWaitList, phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelCreate
+ur_result_t urKernelCreate(
+    ur_program_handle_t hProgram, ///< [in] handle of the program instance
+    const char *pKernelName, ///< [in] pointer to null-terminated string.
+    ur_kernel_handle_t
+        *phKernel ///< [out] pointer to handle of kernel object created.
+) {
+    auto pfnCreate = getContext()->urDdiTable.Kernel.pfnCreate;
+
+    getContext()->logger.debug("==== urKernelCreate");
+
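+    // Only kernels coming from an instrumented program are tracked; all other
+    // kernels pass straight through to the adapter.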
+    UR_CALL(pfnCreate(hProgram, pKernelName, phKernel));
+    if (isInstrumentedKernel(*phKernel)) {
+        UR_CALL(getMsanInterceptor()->insertKernel(*phKernel));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelRetain
+ur_result_t urKernelRetain(
+    ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to retain
+) {
+    auto pfnRetain = getContext()->urDdiTable.Kernel.pfnRetain;
+
+    getContext()->logger.debug("==== urKernelRetain");
+
+    UR_CALL(pfnRetain(hKernel));
+
+    auto KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel);
+    if (KernelInfo) {
+        KernelInfo->RefCount++;
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelRelease
+ur_result_t urKernelRelease(
+    ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release
+) {
+    auto pfnRelease = getContext()->urDdiTable.Kernel.pfnRelease;
+
+    getContext()->logger.debug("==== urKernelRelease");
+    UR_CALL(pfnRelease(hKernel));
+
+    auto KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel);
+    if (KernelInfo) {
+        if (--KernelInfo->RefCount == 0) {
+            UR_CALL(getMsanInterceptor()->eraseKernel(hKernel));
+        }
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelSetArgValue
+ur_result_t urKernelSetArgValue(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t argIndex, ///< [in] argument index in range [0, num args - 1]
+    size_t argSize, ///< [in] size of argument type
+    const ur_kernel_arg_value_properties_t
+        *pProperties, ///< [in][optional] pointer to value properties.
+    const void
+        *pArgValue ///< [in] argument value represented as matching arg type.
+) {
+    auto pfnSetArgValue = getContext()->urDdiTable.Kernel.pfnSetArgValue;
+
+    getContext()->logger.debug("==== urKernelSetArgValue");
+
+    std::shared_ptr<MemBuffer> MemBuffer;
+    std::shared_ptr<KernelInfo> KernelInfo;
+    if (argSize == sizeof(ur_mem_handle_t) &&
+        (MemBuffer = getMsanInterceptor()->getMemBuffer(
+             *ur_cast<const ur_mem_handle_t *>(pArgValue))) &&
+        (KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel))) {
+        std::scoped_lock Guard(KernelInfo->Mutex);
+        KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer);
+    } else {
+        UR_CALL(
+            pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelSetArgMemObj
+ur_result_t urKernelSetArgMemObj(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t argIndex, ///< [in] argument index in range [0, num args - 1]
+    const ur_kernel_arg_mem_obj_properties_t
+        *pProperties, ///< [in][optional] pointer to Memory object properties.
+    ur_mem_handle_t hArgValue ///< [in][optional] handle of Memory object.
+) {
+    auto pfnSetArgMemObj = getContext()->urDdiTable.Kernel.pfnSetArgMemObj;
+
+    getContext()->logger.debug("==== urKernelSetArgMemObj");
+
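+    // Tracked buffer arguments are only recorded here; they are bound to real
+    // device pointers later, in MsanInterceptor::prepareLaunch().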
+    std::shared_ptr<MemBuffer> MemBuffer;
+    std::shared_ptr<KernelInfo> KernelInfo;
+    if ((MemBuffer = getMsanInterceptor()->getMemBuffer(hArgValue)) &&
+        (KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel))) {
+        std::scoped_lock Guard(KernelInfo->Mutex);
+        KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer);
+    } else {
+        UR_CALL(pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Global table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetGlobalProcAddrTable(
+    ur_global_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnAdapterGet = ur_sanitizer_layer::msan::urAdapterGet;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Context table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetContextProcAddrTable(
+    ur_context_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnCreate = ur_sanitizer_layer::msan::urContextCreate;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::msan::urContextRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::msan::urContextRelease;
+
+    pDdiTable->pfnCreateWithNativeHandle =
+        ur_sanitizer_layer::msan::urContextCreateWithNativeHandle;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Program table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetProgramProcAddrTable(
+    ur_program_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    pDdiTable->pfnCreateWithIL =
+        ur_sanitizer_layer::msan::urProgramCreateWithIL;
+    pDdiTable->pfnCreateWithBinary =
+        ur_sanitizer_layer::msan::urProgramCreateWithBinary;
+    pDdiTable->pfnCreateWithNativeHandle =
+        ur_sanitizer_layer::msan::urProgramCreateWithNativeHandle;
+    pDdiTable->pfnBuild = ur_sanitizer_layer::msan::urProgramBuild;
+    pDdiTable->pfnLink = ur_sanitizer_layer::msan::urProgramLink;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::msan::urProgramRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::msan::urProgramRelease;
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Kernel table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetKernelProcAddrTable(
+    ur_kernel_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnCreate = ur_sanitizer_layer::msan::urKernelCreate;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::msan::urKernelRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::msan::urKernelRelease;
+    pDdiTable->pfnSetArgValue = ur_sanitizer_layer::msan::urKernelSetArgValue;
+    pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::msan::urKernelSetArgMemObj;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Mem table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetMemProcAddrTable(
+    ur_mem_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnBufferCreate = ur_sanitizer_layer::msan::urMemBufferCreate;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::msan::urMemRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::msan::urMemRelease;
+    pDdiTable->pfnBufferPartition =
+        ur_sanitizer_layer::msan::urMemBufferPartition;
+    pDdiTable->pfnGetNativeHandle =
+        ur_sanitizer_layer::msan::urMemGetNativeHandle;
+    pDdiTable->pfnGetInfo = ur_sanitizer_layer::msan::urMemGetInfo;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's ProgramExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetProgramExpProcAddrTable(
+    ur_program_exp_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnBuildExp = ur_sanitizer_layer::msan::urProgramBuildExp;
+    pDdiTable->pfnLinkExp = ur_sanitizer_layer::msan::urProgramLinkExp;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's Enqueue table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetEnqueueProcAddrTable(
+    ur_enqueue_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnMemBufferRead =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferRead;
+    pDdiTable->pfnMemBufferWrite =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferWrite;
+    pDdiTable->pfnMemBufferReadRect =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferReadRect;
+    pDdiTable->pfnMemBufferWriteRect =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferWriteRect;
+    pDdiTable->pfnMemBufferCopy =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferCopy;
+    pDdiTable->pfnMemBufferCopyRect =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferCopyRect;
+    pDdiTable->pfnMemBufferFill =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferFill;
+    pDdiTable->pfnMemBufferMap =
+        ur_sanitizer_layer::msan::urEnqueueMemBufferMap;
+    pDdiTable->pfnMemUnmap = ur_sanitizer_layer::msan::urEnqueueMemUnmap;
+    pDdiTable->pfnKernelLaunch =
+        ur_sanitizer_layer::msan::urEnqueueKernelLaunch;
+
+    return result;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Exported function for filling application's USM table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetUSMProcAddrTable(
+    ur_usm_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    pDdiTable->pfnDeviceAlloc = ur_sanitizer_layer::msan::urUSMDeviceAlloc;
+
+    return result;
+}
+
+ur_result_t urCheckVersion(ur_api_version_t version) {
+    if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
+            UR_MAJOR_VERSION(version) ||
+        UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) >
+            UR_MINOR_VERSION(version)) {
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+    return UR_RESULT_SUCCESS;
+}
+
+} // namespace msan
+
+ur_result_t initMsanDDITable(ur_dditable_t *dditable) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    getContext()->logger.always("==== DeviceSanitizer: MSAN");
+
+    if (UR_RESULT_SUCCESS == result) {
+        result =
+            ur_sanitizer_layer::msan::urCheckVersion(UR_API_VERSION_CURRENT);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetGlobalProcAddrTable(
+            &dditable->Global);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetContextProcAddrTable(
+            &dditable->Context);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetKernelProcAddrTable(
+            &dditable->Kernel);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetProgramProcAddrTable(
+            &dditable->Program);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result =
+            ur_sanitizer_layer::msan::urGetMemProcAddrTable(&dditable->Mem);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetProgramExpProcAddrTable(
+            &dditable->ProgramExp);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::msan::urGetEnqueueProcAddrTable(
+            &dditable->Enqueue);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result =
+            ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM);
+    }
+
+    if (result != UR_RESULT_SUCCESS) {
+        getContext()->logger.error("Initialize MSAN DDI table failed: {}",
+                                   result);
+    }
+
+    return result;
+}
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.hpp b/source/loader/layers/sanitizer/msan/msan_ddi.hpp
new file mode 100644
index 0000000000..0e0bc84803
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_ddi.hpp
@@ -0,0 +1,24 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_ddi.hpp
+ *
+ */
+
+#pragma once
+
+#include "ur_ddi.h"
+
+namespace ur_sanitizer_layer {
+
+void initMsanInterceptor();
+void destroyMsanInterceptor();
+
+ur_result_t initMsanDDITable(ur_dditable_t *dditable);
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
new file mode 100644
index 0000000000..30a2e07359
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
@@ -0,0 +1,490 @@
+//===----------------------------------------------------------------------===//
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
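+// The interceptor below owns all per-object bookkeeping (contexts, devices,
+// programs, kernels, buffers, allocations) behind shared mutexes; the DDI
+// entry points above only dispatch into it.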
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_interceptor.cpp
+ *
+ */
+
+#include "msan_interceptor.hpp"
+#include "msan_ddi.hpp"
+#include "msan_options.hpp"
+#include "msan_report.hpp"
+#include "msan_shadow.hpp"
+#include "sanitizer_common/sanitizer_stacktrace.hpp"
+#include "sanitizer_common/sanitizer_utils.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+MsanInterceptor::MsanInterceptor() {}
+
+MsanInterceptor::~MsanInterceptor() {
+    // We must release these objects before releasing adapters, since
+    // they may use the adapter in their destructor
+    for (const auto &[_, DeviceInfo] : m_DeviceMap) {
+        DeviceInfo->Shadow->Destory();
+    }
+
+    m_MemBufferMap.clear();
+    m_AllocationMap.clear();
+    m_KernelMap.clear();
+    m_ContextMap.clear();
+
+    for (auto Adapter : m_Adapters) {
+        getContext()->urDdiTable.Global.pfnAdapterRelease(Adapter);
+    }
+}
+
+ur_result_t MsanInterceptor::allocateMemory(ur_context_handle_t Context,
+                                            ur_device_handle_t Device,
+                                            const ur_usm_desc_t *Properties,
+                                            ur_usm_pool_handle_t Pool,
+                                            size_t Size, void **ResultPtr) {
+
+    auto ContextInfo = getContextInfo(Context);
+    std::shared_ptr<DeviceInfo> DeviceInfo =
+        Device ? getDeviceInfo(Device) : nullptr;
+
+    void *Allocated = nullptr;
+
+    UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
+        Context, Device, Properties, Pool, Size, &Allocated));
+
+    *ResultPtr = Allocated;
+
+    auto AI =
+        std::make_shared<MsanAllocInfo>(MsanAllocInfo{(uptr)Allocated,
+                                                      Size,
+                                                      false,
+                                                      Context,
+                                                      Device,
+                                                      GetCurrentBacktrace(),
+                                                      {}});
+
+    AI->print();
+
+    // For updating shadow memory
+    ContextInfo->insertAllocInfo({Device}, AI);
+
+    // For memory release
+    {
+        std::scoped_lock Guard(m_AllocationMapMutex);
+        m_AllocationMap.emplace(AI->AllocBegin, std::move(AI));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel,
+                                             ur_queue_handle_t Queue,
+                                             USMLaunchInfo &LaunchInfo) {
+    auto Context = GetContext(Queue);
+    auto Device = GetDevice(Queue);
+    auto ContextInfo = getContextInfo(Context);
+    auto DeviceInfo = getDeviceInfo(Device);
+
+    ManagedQueue InternalQueue(Context, Device);
+    if (!InternalQueue) {
+        getContext()->logger.error("Failed to create internal queue");
+        return UR_RESULT_ERROR_INVALID_QUEUE;
+    }
+
+    UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo));
+
+    UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue));
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel,
+                                              ur_queue_handle_t Queue,
+                                              USMLaunchInfo &LaunchInfo) {
+    // FIXME: We must use a blocking operation here until we support
+    // urEventSetCallback
+    auto Result = getContext()->urDdiTable.Queue.pfnFinish(Queue);
+
+    if (Result == UR_RESULT_SUCCESS) {
+        const auto &Report = LaunchInfo.Data->Report;
+
+        if (!Report.Flag) {
+            return Result;
+        }
+
+        ReportUsesUninitializedValue(LaunchInfo.Data->Report, Kernel);
+
+        exitWithErrors();
+    }
+
+    return Result;
+}
+
+ur_result_t
+MsanInterceptor::enqueueAllocInfo(std::shared_ptr<DeviceInfo> &DeviceInfo,
+                                  ur_queue_handle_t Queue,
+                                  std::shared_ptr<MsanAllocInfo> &AI) {
+    return DeviceInfo->Shadow->EnqueuePoisonShadow(Queue, AI->AllocBegin,
+                                                   AI->AllocSize, 0xff);
+}
+
+ur_result_t
+MsanInterceptor::updateShadowMemory(std::shared_ptr<ContextInfo> &ContextInfo,
+                                    std::shared_ptr<DeviceInfo> &DeviceInfo,
+                                    ur_queue_handle_t Queue) {
+    auto &AllocInfos = ContextInfo->AllocInfosMap[DeviceInfo->Handle];
+    std::scoped_lock Guard(AllocInfos.Mutex);
+
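+    // Poison the shadow of every allocation queued for this device, then
+    // clear the list so each allocation is only processed once.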
+    for (auto &AI : AllocInfos.List) {
+        UR_CALL(enqueueAllocInfo(DeviceInfo, Queue, AI));
+    }
+    AllocInfos.List.clear();
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::registerProgram(ur_program_handle_t Program) {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+
+    getContext()->logger.info("registerSpirKernels");
+    Result = registerSpirKernels(Program);
+    if (Result != UR_RESULT_SUCCESS) {
+        return Result;
+    }
+
+    return Result;
+}
+
+ur_result_t MsanInterceptor::unregisterProgram(ur_program_handle_t) {
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
+    auto Context = GetContext(Program);
+    std::vector<ur_device_handle_t> Devices = GetDevices(Program);
+
+    for (auto Device : Devices) {
+        size_t MetadataSize;
+        void *MetadataPtr;
+        ur_result_t Result =
+            getContext()->urDdiTable.Program.pfnGetGlobalVariablePointer(
+                Device, Program, kSPIR_MsanSpirKernelMetadata, &MetadataSize,
+                &MetadataPtr);
+        if (Result != UR_RESULT_SUCCESS) {
+            getContext()->logger.error(
+                "Can't get the pointer of <{}> under device {}: {}",
+                kSPIR_MsanSpirKernelMetadata, (void *)Device, Result);
+            return Result;
+        }
+
+        const uint64_t NumOfSpirKernel = MetadataSize / sizeof(SpirKernelInfo);
+        assert((MetadataSize % sizeof(SpirKernelInfo) == 0) &&
+               "SpirKernelMetadata size is not correct");
+
+        ManagedQueue Queue(Context, Device);
+
+        std::vector<SpirKernelInfo> SKInfo(NumOfSpirKernel);
+        Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+            Queue, true, &SKInfo[0], MetadataPtr,
+            sizeof(SpirKernelInfo) * NumOfSpirKernel, 0, nullptr, nullptr);
+        if (Result != UR_RESULT_SUCCESS) {
+            getContext()->logger.error("Can't read the value of <{}>: {}",
+                                       kSPIR_MsanSpirKernelMetadata, Result);
+            return Result;
+        }
+
+        auto PI = getProgramInfo(Program);
+        for (const auto &SKI : SKInfo) {
+            if (SKI.Size == 0) {
+                continue;
+            }
+            std::vector<char> KernelNameV(SKI.Size);
+            Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, KernelNameV.data(), (void *)SKI.KernelName,
+                sizeof(char) * SKI.Size, 0, nullptr, nullptr);
+            if (Result != UR_RESULT_SUCCESS) {
+                getContext()->logger.error("Can't read kernel name: {}",
+                                           Result);
+                return Result;
+            }
+
+            std::string KernelName =
+                std::string(KernelNameV.begin(), KernelNameV.end());
+
+            getContext()->logger.info(
+                "SpirKernel(name='{}', isInstrumented={})", KernelName, true);
+
+            PI->InstrumentedKernels.insert(KernelName);
+        }
+        getContext()->logger.info("Number of sanitized kernels: {}",
+                                  PI->InstrumentedKernels.size());
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::insertContext(ur_context_handle_t Context,
+                                           std::shared_ptr<ContextInfo> &CI) {
+    std::scoped_lock Guard(m_ContextMapMutex);
+
+    if (m_ContextMap.find(Context) != m_ContextMap.end()) {
+        CI = m_ContextMap.at(Context);
+        return UR_RESULT_SUCCESS;
+    }
+
+    CI = std::make_shared<ContextInfo>(Context);
+
+    // Don't move CI, since it's a return value as well
+    m_ContextMap.emplace(Context, CI);
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::eraseContext(ur_context_handle_t Context) {
+    std::scoped_lock Guard(m_ContextMapMutex);
+    assert(m_ContextMap.find(Context) != m_ContextMap.end());
+    m_ContextMap.erase(Context);
+    // TODO: Remove devices in each context
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::insertDevice(ur_device_handle_t Device,
+                                          std::shared_ptr<DeviceInfo> &DI) {
+    std::scoped_lock Guard(m_DeviceMapMutex);
+
+    if (m_DeviceMap.find(Device) != m_DeviceMap.end()) {
+        DI = m_DeviceMap.at(Device);
+        return UR_RESULT_SUCCESS;
+    }
+
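+    // First time this device is seen: create its record and cache the device
+    // capabilities the interceptor relies on (USM support, base alignment).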
+    DI = std::make_shared<DeviceInfo>(Device);
+
+    DI->IsSupportSharedSystemUSM = GetDeviceUSMCapability(
+        Device, UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT);
+
+    // Query alignment
+    UR_CALL(getContext()->urDdiTable.Device.pfnGetInfo(
+        Device, UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN, sizeof(DI->Alignment),
+        &DI->Alignment, nullptr));
+
+    // Don't move DI, since it's a return value as well
+    m_DeviceMap.emplace(Device, DI);
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::eraseDevice(ur_device_handle_t Device) {
+    std::scoped_lock Guard(m_DeviceMapMutex);
+    assert(m_DeviceMap.find(Device) != m_DeviceMap.end());
+    m_DeviceMap.erase(Device);
+    // TODO: Remove device from each context
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::insertProgram(ur_program_handle_t Program) {
+    std::scoped_lock Guard(m_ProgramMapMutex);
+    if (m_ProgramMap.find(Program) != m_ProgramMap.end()) {
+        return UR_RESULT_SUCCESS;
+    }
+    m_ProgramMap.emplace(Program, std::make_shared<ProgramInfo>(Program));
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::eraseProgram(ur_program_handle_t Program) {
+    std::scoped_lock Guard(m_ProgramMapMutex);
+    assert(m_ProgramMap.find(Program) != m_ProgramMap.end());
+    m_ProgramMap.erase(Program);
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) {
+    std::scoped_lock Guard(m_KernelMapMutex);
+    if (m_KernelMap.find(Kernel) != m_KernelMap.end()) {
+        return UR_RESULT_SUCCESS;
+    }
+    m_KernelMap.emplace(Kernel, std::make_shared<KernelInfo>(Kernel));
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::eraseKernel(ur_kernel_handle_t Kernel) {
+    std::scoped_lock Guard(m_KernelMapMutex);
+    assert(m_KernelMap.find(Kernel) != m_KernelMap.end());
+    m_KernelMap.erase(Kernel);
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t
+MsanInterceptor::insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer) {
+    std::scoped_lock Guard(m_MemBufferMapMutex);
+    assert(m_MemBufferMap.find(ur_cast<ur_mem_handle_t>(MemBuffer.get())) ==
+           m_MemBufferMap.end());
+    m_MemBufferMap.emplace(reinterpret_cast<ur_mem_handle_t>(MemBuffer.get()),
+                           MemBuffer);
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanInterceptor::eraseMemBuffer(ur_mem_handle_t MemHandle) {
+    std::scoped_lock Guard(m_MemBufferMapMutex);
+    assert(m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end());
+    m_MemBufferMap.erase(MemHandle);
+    return UR_RESULT_SUCCESS;
+}
+
+std::shared_ptr<MemBuffer>
+MsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) {
+    std::shared_lock Guard(m_MemBufferMapMutex);
+    if (m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end()) {
+        return m_MemBufferMap[MemHandle];
+    }
+    return nullptr;
+}
+
+ur_result_t MsanInterceptor::prepareLaunch(
+    std::shared_ptr<DeviceInfo> &DeviceInfo, ur_queue_handle_t Queue,
+    ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) {
+    auto Program = GetProgram(Kernel);
+
+    auto EnqueueWriteGlobal =
+        [&Queue, &Program](const char *Name, const void *Value, size_t Size) {
+            auto Result =
+                getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite(
+                    Queue, Program, Name, false, Size, 0, Value, 0, nullptr,
+                    nullptr);
+            if (Result != UR_RESULT_SUCCESS) {
+                getContext()->logger.error(
+                    "Failed to write device global \"{}\": {}", Name, Result);
+                return Result;
+            }
+            return UR_RESULT_SUCCESS;
+        };
+
+    // Set membuffer arguments
+    auto KernelInfo = getKernelInfo(Kernel);
+    assert(KernelInfo && "Kernel must be instrumented");
+
+    for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) {
+        char *ArgPointer = nullptr;
+        UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer));
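+        // Bind the buffer's per-device allocation as the raw pointer argument
+        // recorded earlier by urKernelSetArgValue / urKernelSetArgMemObj.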
+        ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer(
+            Kernel, ArgIndex, nullptr, ArgPointer);
+        if (URes != UR_RESULT_SUCCESS) {
+            getContext()->logger.error(
+                "Failed to set buffer {} as the {} arg to kernel {}: {}",
+                ur_cast<ur_mem_handle_t>(MemBuffer.get()), ArgIndex, Kernel,
+                URes);
+        }
+    }
+
+    // Set LaunchInfo
+    LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin;
+    LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd;
+    LaunchInfo.Data->DeviceTy = DeviceInfo->Type;
+    LaunchInfo.Data->Debug = getOptions().Debug ? 1 : 0;
+
+    getContext()->logger.info(
+        "launch_info {} (GlobalShadow={}, Device={}, Debug={})",
+        (void *)LaunchInfo.Data, LaunchInfo.Data->GlobalShadowOffset,
+        ToString(LaunchInfo.Data->DeviceTy), LaunchInfo.Data->Debug);
+
+    UR_CALL(
+        EnqueueWriteGlobal("__MsanLaunchInfo", &LaunchInfo.Data, sizeof(uptr)));
+
+    return UR_RESULT_SUCCESS;
+}
+
+std::optional<MsanAllocationIterator>
+MsanInterceptor::findAllocInfoByAddress(uptr Address) {
+    std::shared_lock Guard(m_AllocationMapMutex);
+    auto It = m_AllocationMap.upper_bound(Address);
+    if (It == m_AllocationMap.begin()) {
+        return std::nullopt;
+    }
+    --It;
+    // Make sure we got the right MsanAllocInfo
+    assert(Address >= It->second->AllocBegin &&
+           Address < It->second->AllocBegin + It->second->AllocSize &&
+           "Wrong MsanAllocInfo for the address");
+    return It;
+}
+
+std::vector<MsanAllocationIterator>
+MsanInterceptor::findAllocInfoByContext(ur_context_handle_t Context) {
+    std::shared_lock Guard(m_AllocationMapMutex);
+    std::vector<MsanAllocationIterator> AllocInfos;
+    for (auto It = m_AllocationMap.begin(); It != m_AllocationMap.end(); ++It) {
+        const auto &[_, AI] = *It;
+        if (AI->Context == Context) {
+            AllocInfos.emplace_back(It);
+        }
+    }
+    return AllocInfos;
+}
+
+ur_result_t DeviceInfo::allocShadowMemory(ur_context_handle_t Context) {
+    Shadow = GetMsanShadowMemory(Context, Handle, Type);
+    assert(Shadow && "Failed to get shadow memory");
+    UR_CALL(Shadow->Setup());
+    getContext()->logger.info("ShadowMemory(Global): {} - {}",
+                              (void *)Shadow->ShadowBegin,
+                              (void *)Shadow->ShadowEnd);
+    return UR_RESULT_SUCCESS;
+}
+
+bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const {
+    const auto Name = GetKernelName(Kernel);
+    return InstrumentedKernels.find(Name) != InstrumentedKernels.end();
+}
+
+ContextInfo::~ContextInfo() {
+    [[maybe_unused]] auto Result =
+        getContext()->urDdiTable.Context.pfnRelease(Handle);
+    assert(Result == UR_RESULT_SUCCESS);
+}
+
+ur_result_t USMLaunchInfo::initialize() {
+    UR_CALL(getContext()->urDdiTable.Context.pfnRetain(Context));
+    UR_CALL(getContext()->urDdiTable.Device.pfnRetain(Device));
+    UR_CALL(getContext()->urDdiTable.USM.pfnSharedAlloc(
+        Context, Device, nullptr, nullptr, sizeof(MsanLaunchInfo),
+        (void **)&Data));
+    *Data = MsanLaunchInfo{};
+    return UR_RESULT_SUCCESS;
+}
+
+USMLaunchInfo::~USMLaunchInfo() {
+    [[maybe_unused]] ur_result_t Result;
+    if (Data) {
+        Result = getContext()->urDdiTable.USM.pfnFree(Context, (void *)Data);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+    Result = getContext()->urDdiTable.Context.pfnRelease(Context);
+    assert(Result == UR_RESULT_SUCCESS);
+    Result = getContext()->urDdiTable.Device.pfnRelease(Device);
+    assert(Result == UR_RESULT_SUCCESS);
+}
+
+} // namespace msan
+
+using namespace msan;
+
+static MsanInterceptor *interceptor;
+
+MsanInterceptor *getMsanInterceptor() { return interceptor; }
+
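+// Process-wide singleton: created lazily on first use and destroyed
+// explicitly so its destructor can release the adapters it retained.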
+void initMsanInterceptor() {
+    if (interceptor) {
+        return;
+    }
+    interceptor = new MsanInterceptor();
+}
+
+void destroyMsanInterceptor() {
+    delete interceptor;
+    interceptor = nullptr;
+}
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp
new file mode 100644
index 0000000000..80dbf389a4
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp
@@ -0,0 +1,323 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_interceptor.hpp
+ *
+ */
+
+#pragma once
+
+#include "msan_allocator.hpp"
+#include "msan_buffer.hpp"
+#include "msan_libdevice.hpp"
+#include "msan_options.hpp"
+#include "msan_shadow.hpp"
+#include "sanitizer_common/sanitizer_common.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+#include <memory>
+#include <optional>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+struct AllocInfoList {
+    std::vector<std::shared_ptr<MsanAllocInfo>> List;
+    ur_shared_mutex Mutex;
+};
+
+struct DeviceInfo {
+    ur_device_handle_t Handle;
+
+    DeviceType Type = DeviceType::UNKNOWN;
+    size_t Alignment = 0;
+    std::shared_ptr<MsanShadowMemory> Shadow;
+
+    // Device features
+    bool IsSupportSharedSystemUSM = false;
+
+    // Device handles are special and alive for the whole process lifetime,
+    // so we don't need to retain or release them here.
+    explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) {}
+
+    ur_result_t allocShadowMemory(ur_context_handle_t Context);
+};
+
+struct QueueInfo {
+    ur_queue_handle_t Handle;
+
+    // Lock this mutex if the following fields are accessed
+    ur_shared_mutex Mutex;
+    ur_event_handle_t LastEvent;
+
+    explicit QueueInfo(ur_queue_handle_t Queue)
+        : Handle(Queue), LastEvent(nullptr) {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Queue.pfnRetain(Queue);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+
+    ~QueueInfo() {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Queue.pfnRelease(Handle);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+};
+
+struct KernelInfo {
+    ur_kernel_handle_t Handle;
+    std::atomic<int32_t> RefCount = 1;
+
+    // Lock this mutex if the following fields are accessed
+    ur_shared_mutex Mutex;
+    std::unordered_map<uint32_t, std::shared_ptr<MemBuffer>> BufferArgs;
+
+    explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Kernel.pfnRetain(Kernel);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+
+    ~KernelInfo() {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Kernel.pfnRelease(Handle);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+};
+
+struct ProgramInfo {
+    ur_program_handle_t Handle;
+    std::atomic<int32_t> RefCount = 1;
+
+    // Program is built only once, so we don't need to lock it
+    std::unordered_set<std::string> InstrumentedKernels;
+
+    explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Program.pfnRetain(Handle);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+
+    ~ProgramInfo() {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Program.pfnRelease(Handle);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+
+    bool isKernelInstrumented(ur_kernel_handle_t Kernel) const;
+};
+
+struct ContextInfo {
+    ur_context_handle_t Handle;
+    std::atomic<int32_t> RefCount = 1;
+
+    std::vector<ur_device_handle_t> DeviceList;
+    std::unordered_map<ur_device_handle_t, AllocInfoList> AllocInfosMap;
+
+    explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) {
+        [[maybe_unused]] auto Result =
+            getContext()->urDdiTable.Context.pfnRetain(Context);
+        assert(Result == UR_RESULT_SUCCESS);
+    }
+
+    ~ContextInfo();
+
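+    // Queue an allocation for shadow poisoning on each device; the list is
+    // drained by MsanInterceptor::updateShadowMemory() right before a kernel
+    // launch.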
insertAllocInfo(const std::vector<ur_device_handle_t> &Devices,
+                         std::shared_ptr<MsanAllocInfo> &AI) {
+        for (auto Device : Devices) {
+            auto &AllocInfos = AllocInfosMap[Device];
+            std::scoped_lock Guard(AllocInfos.Mutex);
+            AllocInfos.List.emplace_back(AI);
+        }
+    }
+};
+
+struct USMLaunchInfo {
+    MsanLaunchInfo *Data = nullptr;
+
+    ur_context_handle_t Context = nullptr;
+    ur_device_handle_t Device = nullptr;
+    const size_t *GlobalWorkSize = nullptr;
+    const size_t *GlobalWorkOffset = nullptr;
+    std::vector<size_t> LocalWorkSize;
+    uint32_t WorkDim = 0;
+
+    USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device,
+                  const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+                  const size_t *GlobalWorkOffset, uint32_t WorkDim)
+        : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize),
+          GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) {
+        if (LocalWorkSize) {
+            this->LocalWorkSize =
+                std::vector<size_t>(LocalWorkSize, LocalWorkSize + WorkDim);
+        }
+    }
+    ~USMLaunchInfo();
+
+    ur_result_t initialize();
+};
+
+struct SpirKernelInfo {
+    uptr KernelName;
+    uptr Size;
+};
+
+class MsanInterceptor {
+  public:
+    explicit MsanInterceptor();
+
+    ~MsanInterceptor();
+
+    ur_result_t allocateMemory(ur_context_handle_t Context,
+                               ur_device_handle_t Device,
+                               const ur_usm_desc_t *Properties,
+                               ur_usm_pool_handle_t Pool, size_t Size,
+                               void **ResultPtr);
+
+    ur_result_t registerProgram(ur_program_handle_t Program);
+    ur_result_t unregisterProgram(ur_program_handle_t Program);
+
+    ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel,
+                                ur_queue_handle_t Queue,
+                                msan::USMLaunchInfo &LaunchInfo);
+    ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel,
+                                 ur_queue_handle_t Queue,
+                                 msan::USMLaunchInfo &LaunchInfo);
+
+    ur_result_t insertContext(ur_context_handle_t Context,
+                              std::shared_ptr<ContextInfo> &CI);
+    ur_result_t eraseContext(ur_context_handle_t Context);
+
+    ur_result_t insertDevice(ur_device_handle_t Device,
+                             std::shared_ptr<DeviceInfo> &CI);
+    ur_result_t eraseDevice(ur_device_handle_t Device);
+
+    ur_result_t insertProgram(ur_program_handle_t Program);
+    ur_result_t eraseProgram(ur_program_handle_t Program);
+
+    ur_result_t insertKernel(ur_kernel_handle_t Kernel);
+    ur_result_t eraseKernel(ur_kernel_handle_t Kernel);
+
+    ur_result_t insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer);
+    ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle);
+    std::shared_ptr<MemBuffer> getMemBuffer(ur_mem_handle_t MemHandle);
+
+    ur_result_t holdAdapter(ur_adapter_handle_t Adapter) {
+        std::scoped_lock Guard(m_AdaptersMutex);
+        if (m_Adapters.find(Adapter) != m_Adapters.end()) {
+            return UR_RESULT_SUCCESS;
+        }
+        UR_CALL(getContext()->urDdiTable.Global.pfnAdapterRetain(Adapter));
+        m_Adapters.insert(Adapter);
+        return UR_RESULT_SUCCESS;
+    }
+
+    std::optional<MsanAllocationIterator> findAllocInfoByAddress(uptr Address);
+
+    std::vector<MsanAllocationIterator>
+    findAllocInfoByContext(ur_context_handle_t Context);
+
+    std::shared_ptr<ContextInfo>
+    getContextInfo(ur_context_handle_t Context) {
+        std::shared_lock Guard(m_ContextMapMutex);
+        assert(m_ContextMap.find(Context) != m_ContextMap.end());
+        return m_ContextMap[Context];
+    }
+
+    std::shared_ptr<DeviceInfo> getDeviceInfo(ur_device_handle_t Device) {
+        std::shared_lock Guard(m_DeviceMapMutex);
+        assert(m_DeviceMap.find(Device) != m_DeviceMap.end());
+        return m_DeviceMap[Device];
+    }
+
+    std::shared_ptr<ProgramInfo>
+    getProgramInfo(ur_program_handle_t Program) {
+        std::shared_lock Guard(m_ProgramMapMutex);
+        assert(m_ProgramMap.find(Program) != m_ProgramMap.end());
+        return m_ProgramMap[Program];
+    }
+
+    std::shared_ptr<KernelInfo> getKernelInfo(ur_kernel_handle_t Kernel) {
+        std::shared_lock Guard(m_KernelMapMutex);
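+        // Unlike the other lookup helpers above, a miss here is allowed:
+        // return nullptr and let the caller handle an unregistered kernel.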
+        if (m_KernelMap.find(Kernel) != m_KernelMap.end()) {
+            return m_KernelMap[Kernel];
+        }
+        return nullptr;
+    }
+
+    const MsanOptions &getOptions() { return m_Options; }
+
+    void exitWithErrors() {
+        m_NormalExit = false;
+        exit(1);
+    }
+
+    bool isNormalExit() { return m_NormalExit; }
+
+  private:
+    ur_result_t
+    updateShadowMemory(std::shared_ptr<ContextInfo> &ContextInfo,
+                       std::shared_ptr<DeviceInfo> &DeviceInfo,
+                       ur_queue_handle_t Queue);
+
+    ur_result_t enqueueAllocInfo(std::shared_ptr<DeviceInfo> &DeviceInfo,
+                                 ur_queue_handle_t Queue,
+                                 std::shared_ptr<MsanAllocInfo> &AI);
+
+    /// Initialize Global Variables & Kernel Name at first Launch
+    ur_result_t prepareLaunch(std::shared_ptr<DeviceInfo> &DeviceInfo,
+                              ur_queue_handle_t Queue,
+                              ur_kernel_handle_t Kernel,
+                              msan::USMLaunchInfo &LaunchInfo);
+
+    ur_result_t
+    allocShadowMemory(ur_context_handle_t Context,
+                      std::shared_ptr<DeviceInfo> &DeviceInfo);
+
+    ur_result_t registerSpirKernels(ur_program_handle_t Program);
+
+  private:
+    std::unordered_map<ur_context_handle_t, std::shared_ptr<ContextInfo>>
+        m_ContextMap;
+    ur_shared_mutex m_ContextMapMutex;
+    std::unordered_map<ur_device_handle_t, std::shared_ptr<DeviceInfo>>
+        m_DeviceMap;
+    ur_shared_mutex m_DeviceMapMutex;
+
+    std::unordered_map<ur_program_handle_t, std::shared_ptr<ProgramInfo>>
+        m_ProgramMap;
+    ur_shared_mutex m_ProgramMapMutex;
+
+    std::unordered_map<ur_kernel_handle_t, std::shared_ptr<KernelInfo>>
+        m_KernelMap;
+    ur_shared_mutex m_KernelMapMutex;
+
+    std::unordered_map<ur_mem_handle_t, std::shared_ptr<MemBuffer>>
+        m_MemBufferMap;
+    ur_shared_mutex m_MemBufferMapMutex;
+
+    /// Assumption: all USM chunks are allocated in one VA
+    MsanAllocationMap m_AllocationMap;
+    ur_shared_mutex m_AllocationMapMutex;
+
+    MsanOptions m_Options;
+
+    std::unordered_set<ur_adapter_handle_t> m_Adapters;
+    ur_shared_mutex m_AdaptersMutex;
+
+    bool m_NormalExit = true;
+};
+
+} // namespace msan
+
+msan::MsanInterceptor *getMsanInterceptor();
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
new file mode 100644
index 0000000000..cd05cfa38c
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
@@ -0,0 +1,66 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_libdevice.hpp
+ *
+ */
+
+#pragma once
+
+#include "sanitizer_common/sanitizer_libdevice.hpp"
+
+#if !defined(__SPIR__) && !defined(__SPIRV__)
+namespace ur_sanitizer_layer {
+#endif // !__SPIR__ && !__SPIRV__
+
+struct MsanErrorReport {
+    int Flag = 0;
+
+    char File[256 + 1] = {};
+    char Func[256 + 1] = {};
+
+    int32_t Line = 0;
+
+    uint64_t GID0 = 0;
+    uint64_t GID1 = 0;
+    uint64_t GID2 = 0;
+
+    uint64_t LID0 = 0;
+    uint64_t LID1 = 0;
+    uint64_t LID2 = 0;
+
+    uint32_t AccessSize = 0;
+    ErrorType ErrorTy = ErrorType::UNKNOWN;
+};
+
+struct MsanLocalArgsInfo {
+    uint64_t Size = 0;
+    uint64_t SizeWithRedZone = 0;
+};
+
+struct MsanLaunchInfo {
+    uintptr_t GlobalShadowOffset = 0;
+    uintptr_t GlobalShadowOffsetEnd = 0;
+
+    DeviceType DeviceTy = DeviceType::UNKNOWN;
+    uint32_t Debug = 0;
+    uint32_t IsRecover = 0;
+
+    MsanErrorReport Report;
+};
+
+// Based on observation, only the lowest 24 bits of a private variable's
+// address vary.
+constexpr std::size_t MSAN_PRIVATE_SIZE = 0xffffffULL + 1;
+
+constexpr auto kSPIR_MsanDeviceGlobalMetadata = "__MsanDeviceGlobalMetadata";
+constexpr auto kSPIR_MsanSpirKernelMetadata = "__MsanKernelMetadata";
+
+#if !defined(__SPIR__) && !defined(__SPIRV__)
+} // namespace ur_sanitizer_layer
+#endif // !__SPIR__ && !__SPIRV__
diff --git a/source/loader/layers/sanitizer/msan/msan_options.cpp b/source/loader/layers/sanitizer/msan/msan_options.cpp
new file mode 100644
index 0000000000..b6ab9484da
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_options.cpp
@@ -0,0 +1,90 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_options.cpp
+ *
+ */
+
+#include "msan_options.hpp"
+
+#include "ur/ur.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+#include
+#include
+#include
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+MsanOptions::MsanOptions() {
+    std::optional<EnvVarMap> OptionsEnvMap;
+    try {
+        OptionsEnvMap = getenv_to_map("UR_LAYER_MSAN_OPTIONS");
+    } catch (const std::invalid_argument &e) {
+        std::stringstream SS;
+        SS << "[ERROR]: ";
+        SS << e.what();
+        getContext()->logger.always(SS.str().c_str());
+        die("Sanitizer failed to parse options.\n");
+    }
+
+    if (!OptionsEnvMap.has_value()) {
+        return;
+    }
+
+    const char *TrueStrings[] = {"1", "true"};
+    const char *FalseStrings[] = {"0", "false"};
+
+    auto InplaceToLower = [](std::string &S) {
+        std::transform(S.begin(), S.end(), S.begin(),
+                       [](unsigned char C) { return std::tolower(C); });
+    };
+    auto IsTrue = [&](const std::string &S) {
+        return std::any_of(std::begin(TrueStrings), std::end(TrueStrings),
+                           [&](const char *CS) { return S == CS; });
+    };
+    auto IsFalse = [&](const std::string &S) {
+        return std::any_of(std::begin(FalseStrings), std::end(FalseStrings),
+                           [&](const char *CS) { return S == CS; });
+    };
+
+    auto SetBoolOption = [&](const std::string &Name, bool &Opt) {
+        auto KV = OptionsEnvMap->find(Name);
+        if (KV != OptionsEnvMap->end()) {
+            auto Value = KV->second.front();
+            InplaceToLower(Value);
+            if (IsTrue(Value)) {
+                Opt = true;
+            } else if (IsFalse(Value)) {
+                Opt = false;
+            } else {
+                std::stringstream SS;
+                SS << "\"" << Name << "\" is set to \"" << Value
+                   << "\", which is not a valid setting. ";
+                SS << "Acceptable inputs are: to enable, use:";
+                for (auto &S : TrueStrings) {
+                    SS << " \"" << S << "\"";
+                }
+                SS << "; ";
+                SS << "to disable, use:";
+                for (auto &S : FalseStrings) {
+                    SS << " \"" << S << "\"";
+                }
+                SS << ".";
+                getContext()->logger.error(SS.str().c_str());
+                die("Sanitizer failed to parse options.\n");
+            }
+        }
+    };
+
+    SetBoolOption("debug", Debug);
+}
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_options.hpp b/source/loader/layers/sanitizer/msan/msan_options.hpp
new file mode 100644
index 0000000000..94b1e2c31e
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_options.hpp
@@ -0,0 +1,27 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_options.hpp
+ *
+ */
+
+#pragma once
+
+#include
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+struct MsanOptions {
+    bool Debug = false;
+
+    explicit MsanOptions();
+};
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_report.cpp b/source/loader/layers/sanitizer/msan/msan_report.cpp
new file mode 100644
index 0000000000..c6f33a4c93
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_report.cpp
@@ -0,0 +1,43 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_report.cpp
+ *
+ */
+
+#include "msan_report.hpp"
+#include "msan_libdevice.hpp"
+
+#include "sanitizer_common/sanitizer_common.hpp"
+#include "sanitizer_common/sanitizer_utils.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+void ReportUsesUninitializedValue(const MsanErrorReport &Report,
+                                  ur_kernel_handle_t Kernel) {
+    const char *File = Report.File[0] ? Report.File : "<unknown file>";
+    const char *Func = Report.Func[0] ? Report.Func : "<unknown func>";
+    auto KernelName = GetKernelName(Kernel);
+
+    // Try to demangle the kernel name
+    KernelName = DemangleName(KernelName);
+
+    getContext()->logger.always(
+        "====WARNING: DeviceSanitizer: use-of-uninitialized-value");
+    getContext()->logger.always(
+        "use of size {} at kernel <{}> LID({}, {}, {}) GID({}, "
+        "{}, {})",
+        Report.AccessSize, KernelName.c_str(), Report.LID0, Report.LID1,
+        Report.LID2, Report.GID0, Report.GID1, Report.GID2);
+    getContext()->logger.always(" #0 {} {}:{}", Func, File, Report.Line);
+}
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_report.hpp b/source/loader/layers/sanitizer/msan/msan_report.hpp
new file mode 100644
index 0000000000..a8d7ac2324
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_report.hpp
@@ -0,0 +1,27 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_report.hpp
+ *
+ */
+
+#pragma once
+
+#include "ur_api.h"
+
+namespace ur_sanitizer_layer {
+
+struct MsanErrorReport;
+
+namespace msan {
+
+void ReportUsesUninitializedValue(const MsanErrorReport &Report,
+                                  ur_kernel_handle_t Kernel);
+
+} // namespace msan
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp
new file mode 100644
index 0000000000..add9813db6
--- /dev/null
+++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp
@@ -0,0 +1,291 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file msan_shadow.cpp
+ *
+ */
+
+#include "msan_shadow.hpp"
+#include "msan_interceptor.hpp"
+#include "sanitizer_common/sanitizer_utils.hpp"
+#include "ur_api.h"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+namespace msan {
+
+#define CPU_SHADOW1_BEGIN 0x010000000000ULL
+#define CPU_SHADOW1_END 0x100000000000ULL
+#define CPU_SHADOW2_BEGIN 0x200000000000ULL
+#define CPU_SHADOW2_END 0x300000000000ULL
+#define CPU_SHADOW3_BEGIN 0x500000000000ULL
+#define CPU_SHADOW3_END 0x510000000000ULL
+
+#define CPU_SHADOW_MASK 0x500000000000ULL
+
+std::shared_ptr<MsanShadowMemory>
+GetMsanShadowMemory(ur_context_handle_t Context, ur_device_handle_t Device,
+                    DeviceType Type) {
+    if (Type == DeviceType::CPU) {
+        static std::shared_ptr<MsanShadowMemory> ShadowCPU =
+            std::make_shared<MsanShadowMemoryCPU>(Context, Device);
+        return ShadowCPU;
+    } else if (Type == DeviceType::GPU_PVC) {
+        static std::shared_ptr<MsanShadowMemory> ShadowPVC =
+            std::make_shared<MsanShadowMemoryPVC>(Context, Device);
+        return ShadowPVC;
+    } else if (Type == DeviceType::GPU_DG2) {
+        static std::shared_ptr<MsanShadowMemory> ShadowDG2 =
+            std::make_shared<MsanShadowMemoryDG2>(Context, Device);
+        return ShadowDG2;
+    } else {
+        getContext()->logger.error("Unsupported device type");
+        return nullptr;
+    }
+}
+
+ur_result_t MsanShadowMemoryCPU::Setup() {
+    static ur_result_t Result = [this]() {
+        if (MmapFixedNoReserve(CPU_SHADOW1_BEGIN,
+                               CPU_SHADOW1_END - CPU_SHADOW1_BEGIN) == 0) {
+            return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+        }
+        if (ProtectMemoryRange(CPU_SHADOW1_END,
+                               CPU_SHADOW2_BEGIN - CPU_SHADOW1_END) == 0) {
+            return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+        }
+        if (MmapFixedNoReserve(CPU_SHADOW2_BEGIN,
+                               CPU_SHADOW2_END - CPU_SHADOW2_BEGIN) == 0) {
+            return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+        }
+        if (ProtectMemoryRange(CPU_SHADOW2_END,
+                               CPU_SHADOW3_BEGIN - CPU_SHADOW2_END) == 0) {
+            return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+        }
+        if (MmapFixedNoReserve(CPU_SHADOW3_BEGIN,
+                               CPU_SHADOW3_END - CPU_SHADOW3_BEGIN) == 0) {
+            return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+        }
+        ShadowBegin = CPU_SHADOW1_BEGIN;
+        ShadowEnd = CPU_SHADOW3_END;
+        DontCoredumpRange(ShadowBegin, ShadowEnd - ShadowBegin);
+        return UR_RESULT_SUCCESS;
+    }();
+    return Result;
+}
+
+ur_result_t MsanShadowMemoryCPU::Destory() {
+    if (ShadowBegin == 0 && ShadowEnd == 0) {
+        return UR_RESULT_SUCCESS;
+    }
+    static ur_result_t Result = [this]() {
+        if (!Munmap(CPU_SHADOW1_BEGIN, CPU_SHADOW1_END - CPU_SHADOW1_BEGIN)) {
+            return UR_RESULT_ERROR_UNKNOWN;
+        }
+        if (!Munmap(CPU_SHADOW1_END, CPU_SHADOW2_BEGIN - CPU_SHADOW1_END)) {
+            return UR_RESULT_ERROR_UNKNOWN;
+        }
+        if (!Munmap(CPU_SHADOW2_BEGIN, CPU_SHADOW2_END - CPU_SHADOW2_BEGIN)) {
+            return UR_RESULT_ERROR_UNKNOWN;
+        }
+        if (!Munmap(CPU_SHADOW2_END, CPU_SHADOW3_BEGIN - CPU_SHADOW2_END)) {
+            return UR_RESULT_ERROR_UNKNOWN;
+        }
+        if (!Munmap(CPU_SHADOW3_BEGIN, CPU_SHADOW3_END - CPU_SHADOW3_BEGIN)) {
+            return UR_RESULT_ERROR_UNKNOWN;
+        }
+        ShadowBegin = ShadowEnd = 0;
+        return UR_RESULT_SUCCESS;
+    }();
+    return Result;
+}
+
+uptr MsanShadowMemoryCPU::MemToShadow(uptr Ptr) {
+    return Ptr ^ CPU_SHADOW_MASK;
+}
+
+ur_result_t MsanShadowMemoryCPU::EnqueuePoisonShadow(ur_queue_handle_t,
+                                                     uptr Ptr, uptr Size,
+                                                     u8 Value) {
+    if (Size == 0) {
+        return UR_RESULT_SUCCESS;
+    }
+
+    uptr ShadowBegin = MemToShadow(Ptr);
+    uptr ShadowEnd = MemToShadow(Ptr + Size - 1);
+    assert(ShadowBegin <= ShadowEnd);
+    getContext()->logger.debug(
+        "EnqueuePoisonShadow(addr={}, count={}, value={})", (void *)ShadowBegin,
+        ShadowEnd - ShadowBegin + 1, (void *)(size_t)Value);
+    memset((void *)ShadowBegin, Value, ShadowEnd - ShadowBegin + 1);
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MsanShadowMemoryGPU::Setup() {
+    // Currently, Level Zero doesn't create independent VAs for each context.
+    // If we reserved shadow memory per context, this would cause an
+    // out-of-resource error when the user creates multiple contexts.
+    // Therefore, we just create one shadow memory here.
+    static ur_result_t Result = [this]() {
+        size_t ShadowSize = GetShadowSize();
+        // TODO: Protect Bad Zone
+        auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve(
+            Context, nullptr, ShadowSize, (void **)&ShadowBegin);
+        if (Result == UR_RESULT_SUCCESS) {
+            ShadowEnd = ShadowBegin + ShadowSize;
+            // Retain the context which reserves shadow memory
+            getContext()->urDdiTable.Context.pfnRetain(Context);
+        }
+
+        // Set shadow memory for null pointer
+        ManagedQueue Queue(Context, Device);
+        return UR_RESULT_SUCCESS;
+    }();
+    return Result;
+}
+
+ur_result_t MsanShadowMemoryGPU::Destory() {
+    if (ShadowBegin == 0) {
+        return UR_RESULT_SUCCESS;
+    }
+    static ur_result_t Result = [this]() {
+        auto Result = getContext()->urDdiTable.VirtualMem.pfnFree(
+            Context, (const void *)ShadowBegin, GetShadowSize());
+        getContext()->urDdiTable.Context.pfnRelease(Context);
+        return Result;
+    }();
+    return Result;
+}
+
+ur_result_t MsanShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
+                                                     uptr Ptr, uptr Size,
+                                                     u8 Value) {
+    if (Size == 0) {
+        return UR_RESULT_SUCCESS;
+    }
+
+    uptr ShadowBegin = MemToShadow(Ptr);
+    uptr ShadowEnd = MemToShadow(Ptr + Size - 1);
+    assert(ShadowBegin <= ShadowEnd);
+    {
+        static const size_t PageSize =
+            GetVirtualMemGranularity(Context, Device);
+
+        ur_physical_mem_properties_t Desc{
+            UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, nullptr, 0};
+
+        // Make sure [Ptr, Ptr + Size] is mapped to physical memory
+        for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize);
+             MappedPtr <= ShadowEnd; MappedPtr += PageSize) {
+            std::scoped_lock Guard(VirtualMemMapsMutex);
+            if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) {
+                ur_physical_mem_handle_t PhysicalMem{};
+                auto URes = getContext()->urDdiTable.PhysicalMem.pfnCreate(
+                    Context, Device, PageSize, &Desc, &PhysicalMem);
+                if (URes != UR_RESULT_SUCCESS) {
+                    getContext()->logger.error("urPhysicalMemCreate(): {}",
+                                               URes);
+                    return URes;
+                }
+
+                URes = getContext()->urDdiTable.VirtualMem.pfnMap(
+                    Context, (void *)MappedPtr, PageSize, PhysicalMem, 0,
+                    UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE);
+                if (URes != UR_RESULT_SUCCESS) {
+                    getContext()->logger.error("urVirtualMemMap({}, {}): {}",
+                                               (void *)MappedPtr, PageSize,
+                                               URes);
+                    return URes;
+                }
+
+                getContext()->logger.debug("urVirtualMemMap: 
{} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + + // Initialize to zero + URes = EnqueueUSMBlockingSet(Queue, (void *)MappedPtr, 0, + PageSize); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueueUSMBlockingSet(): {}", + URes); + return URes; + } + + VirtualMemMaps[MappedPtr].first = PhysicalMem; + } + + // We don't need to record virtual memory map for null pointer, + // since it doesn't have an alloc info. + if (Ptr == 0) { + continue; + } + + auto AllocInfoIt = + getMsanInterceptor()->findAllocInfoByAddress(Ptr); + assert(AllocInfoIt); + VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); + } + } + + auto URes = EnqueueUSMBlockingSet(Queue, (void *)ShadowBegin, Value, + ShadowEnd - ShadowBegin + 1); + getContext()->logger.debug( + "EnqueuePoisonShadow (addr={}, count={}, value={}): {}", + (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, (void *)(size_t)Value, + URes); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueueUSMBlockingSet(): {}", URes); + return URes; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +MsanShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { + uptr ShadowBegin = MemToShadow(AI->AllocBegin); + uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize); + assert(ShadowBegin <= ShadowEnd); + + static const size_t PageSize = GetVirtualMemGranularity(Context, Device); + + for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); + MappedPtr <= ShadowEnd; MappedPtr += PageSize) { + std::scoped_lock Guard(VirtualMemMapsMutex); + if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { + continue; + } + VirtualMemMaps[MappedPtr].second.erase(AI); + if (VirtualMemMaps[MappedPtr].second.empty()) { + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( + Context, (void *)MappedPtr, PageSize)); + UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease( + VirtualMemMaps[MappedPtr].first)); + getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + } + } + + return UR_RESULT_SUCCESS; +} + +uptr MsanShadowMemoryPVC::MemToShadow(uptr Ptr) { + assert(Ptr & 0xFF00000000000000ULL && "Ptr must be device USM"); + return ShadowBegin + (Ptr & 0x3FFF'FFFF'FFFFULL); +} + +uptr MsanShadowMemoryDG2::MemToShadow(uptr Ptr) { + assert(Ptr & 0xFFFF000000000000ULL && "Ptr must be device USM"); + return ShadowBegin + (Ptr & 0x3FFF'FFFF'FFFFULL); +} + +} // namespace msan +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp new file mode 100644 index 0000000000..de13683cbc --- /dev/null +++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp @@ -0,0 +1,144 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file msan_shadow.hpp + * + */ + +#pragma once + +#include "msan_allocator.hpp" +#include "sanitizer_common/sanitizer_libdevice.hpp" + +#include + +namespace ur_sanitizer_layer { +namespace msan { + +struct MsanShadowMemory { + MsanShadowMemory(ur_context_handle_t Context, ur_device_handle_t Device) + : Context(Context), Device(Device) {} + + virtual ~MsanShadowMemory() {} + + virtual ur_result_t Setup() = 0; + + virtual ur_result_t Destory() = 0; + + virtual uptr MemToShadow(uptr Ptr) = 0; + + virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) = 0; + + virtual ur_result_t ReleaseShadow(std::shared_ptr) { + return UR_RESULT_SUCCESS; + } + + ur_context_handle_t Context{}; + + ur_device_handle_t Device{}; + + uptr ShadowBegin = 0; + + uptr ShadowEnd = 0; +}; + +/// Shadow Memory layout of CPU device +/// +/// 0x000000000000 ~ 0x010000000000 "app-1" +/// 0x010000000000 ~ 0x100000000000 "shadow-2" +/// 0x100000000000 ~ 0x110000000000 "invalid" +/// 0x110000000000 ~ 0x200000000000 "origin-2" +/// 0x200000000000 ~ 0x300000000000 "shadow-3" +/// 0x300000000000 ~ 0x400000000000 "origin-3" +/// 0x400000000000 ~ 0x500000000000 "invalid" +/// 0x500000000000 ~ 0x510000000000 "shadow-1" +/// 0x510000000000 ~ 0x600000000000 "app-2" +/// 0x600000000000 ~ 0x610000000000 "origin-1" +/// 0x610000000000 ~ 0x700000000000 "invalid" +/// 0x700000000000 ~ 0x740000000000 "allocator" +/// 0x740000000000 ~ 0x800000000000 "app-3" +/// +struct MsanShadowMemoryCPU final : public MsanShadowMemory { + MsanShadowMemoryCPU(ur_context_handle_t Context, ur_device_handle_t Device) + : MsanShadowMemory(Context, Device) {} + + ur_result_t Setup() override; + + ur_result_t Destory() override; + + uptr MemToShadow(uptr Ptr) override; + + ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) override; +}; + +struct MsanShadowMemoryGPU : public MsanShadowMemory { + MsanShadowMemoryGPU(ur_context_handle_t Context, ur_device_handle_t Device) + : MsanShadowMemory(Context, Device) {} + + ur_result_t Setup() override; + + ur_result_t Destory() override; + ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) override final; + + ur_result_t ReleaseShadow(std::shared_ptr AI) override final; + + virtual size_t GetShadowSize() = 0; + + ur_mutex VirtualMemMapsMutex; + + std::unordered_map< + uptr, std::pair>>> + VirtualMemMaps; +}; + +/// Shadow Memory layout of GPU PVC device +/// +/// USM Allocation Range (56 bits) +/// Host USM : 0x0000_0000_0000_0000 ~ 0x00ff_ffff_ffff_ffff +/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 0xff00_0000_0000_0000 ~ 0xff00_ffff_ffff_ffff +/// +/// USM Allocation Range (AllocateHostAllocationsInHeapExtendedHost=0) +/// Host USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 0xff00_0000_0000_0000 ~ 0xff00_ffff_ffff_ffff +/// +struct MsanShadowMemoryPVC final : public MsanShadowMemoryGPU { + MsanShadowMemoryPVC(ur_context_handle_t Context, ur_device_handle_t Device) + : MsanShadowMemoryGPU(Context, Device) {} + + uptr MemToShadow(uptr Ptr) override; + + size_t GetShadowSize() override { return 0x8000'0000'0000ULL; } +}; + +/// Shadow Memory layout of GPU DG2 device +/// +/// USM Allocation Range (48 bits) +/// Host/Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 
0xffff_8000_0000_0000 ~ 0xffff_ffff_ffff_ffff +/// +struct MsanShadowMemoryDG2 final : public MsanShadowMemoryGPU { + MsanShadowMemoryDG2(ur_context_handle_t Context, ur_device_handle_t Device) + : MsanShadowMemoryGPU(Context, Device) {} + + uptr MemToShadow(uptr Ptr) override; + + size_t GetShadowSize() override { return 0x4000'0000'0000ULL; } +}; + +std::shared_ptr +GetMsanShadowMemory(ur_context_handle_t Context, ur_device_handle_t Device, + DeviceType Type); + +} // namespace msan +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp index 380482ff84..df64a72ed7 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp @@ -26,6 +26,15 @@ namespace ur_sanitizer_layer { bool IsInASanContext() { return (void *)__asan_init != nullptr; } +uptr MmapFixedNoReserve(uptr Addr, uptr Size) { + Size = RoundUpTo(Size, EXEC_PAGESIZE); + Addr = RoundDownTo(Addr, EXEC_PAGESIZE); + void *P = + mmap((void *)Addr, Size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0); + return (uptr)P; +} + uptr MmapNoReserve(uptr Addr, uptr Size) { Size = RoundUpTo(Size, EXEC_PAGESIZE); Addr = RoundDownTo(Addr, EXEC_PAGESIZE); @@ -36,6 +45,15 @@ uptr MmapNoReserve(uptr Addr, uptr Size) { bool Munmap(uptr Addr, uptr Size) { return munmap((void *)Addr, Size) == 0; } +uptr ProtectMemoryRange(uptr Addr, uptr Size) { + Size = RoundUpTo(Size, EXEC_PAGESIZE); + Addr = RoundDownTo(Addr, EXEC_PAGESIZE); + void *P = + mmap((void *)Addr, Size, PROT_NONE, + MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0); + return (uptr)P; +} + bool DontCoredumpRange(uptr Addr, uptr Size) { Size = RoundUpTo(Size, EXEC_PAGESIZE); Addr = RoundDownTo(Addr, EXEC_PAGESIZE); diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_common.hpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_common.hpp index 147bd23be3..c23dc15f7e 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_common.hpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_common.hpp @@ -139,8 +139,10 @@ struct SourceInfo { bool IsInASanContext(); +uptr MmapFixedNoReserve(uptr Addr, uptr Size); uptr MmapNoReserve(uptr Addr, uptr Size); bool Munmap(uptr Addr, uptr Size); +uptr ProtectMemoryRange(uptr Addr, uptr Size); bool DontCoredumpRange(uptr Addr, uptr Size); void *GetMemFunctionPointer(const char *); diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp new file mode 100644 index 0000000000..59764645f9 --- /dev/null +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -0,0 +1,54 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file ur_sanddi.cpp
+ *
+ */
+
+#include "asan/asan_ddi.hpp"
+#include "msan/msan_ddi.hpp"
+#include "ur_sanitizer_layer.hpp"
+
+namespace ur_sanitizer_layer {
+
+ur_result_t context_t::init(ur_dditable_t *dditable,
+                            const std::set<std::string> &enabledLayerNames,
+                            [[maybe_unused]] codeloc_data codelocData) {
+    bool asanEnabled = enabledLayerNames.count("UR_LAYER_ASAN");
+    bool msanEnabled = enabledLayerNames.count("UR_LAYER_MSAN");
+
+    if (asanEnabled && msanEnabled) {
+        getContext()->logger.warning(
+            "Enabling ASAN and MSAN at the same time is not "
+            "supported.");
+        return UR_RESULT_SUCCESS;
+    } else if (asanEnabled) {
+        enabledType = SanitizerType::AddressSanitizer;
+    } else if (msanEnabled) {
+        enabledType = SanitizerType::MemorySanitizer;
+    } else {
+        return UR_RESULT_SUCCESS;
+    }
+
+    urDdiTable = *dditable;
+
+    switch (enabledType) {
+    case SanitizerType::AddressSanitizer:
+        initAsanInterceptor();
+        return initAsanDDITable(dditable);
+    case SanitizerType::MemorySanitizer:
+        initMsanInterceptor();
+        return initMsanDDITable(dditable);
+    default:
+        break;
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp
index d1e00c640c..d0a172f5e5 100644
--- a/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp
+++ b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp
@@ -12,6 +12,7 @@
 #include "ur_sanitizer_layer.hpp"
 #include "asan/asan_ddi.hpp"
+#include "msan/msan_ddi.hpp"
 namespace ur_sanitizer_layer {
 context_t *getContext() { return context_t::get_direct(); }
@@ -26,6 +27,9 @@ ur_result_t context_t::tearDown() {
     case SanitizerType::AddressSanitizer:
         destroyAsanInterceptor();
         break;
+    case SanitizerType::MemorySanitizer:
+        destroyMsanInterceptor();
+        break;
     default:
         break;
     }

From bd242546840aa08f9fe8575daf14cc9bbe4a978f Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz
Date: Wed, 27 Nov 2024 20:23:20 +0000
Subject: [PATCH 103/148] [L0 v2] fix enqueueEventsWaitWithBarrier

Use actual barrier when profiling is enabled to ensure we get proper
profiling info.
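The user-visible difference (a hypothetical host-side sketch; the new
conformance test below exercises the same path):

    // With a profiling-enabled queue, the barrier's event must report
    // valid COMMAND_START/COMMAND_END timestamps:
    urEnqueueEventsWaitWithBarrier(queue, 1, &dep, &event);
    urQueueFinish(queue);
    uint64_t start = 0, end = 0;
    urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START,
                            sizeof(start), &start, nullptr);
    urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END,
                            sizeof(end), &end, nullptr);
    // Previously both could read as 0, because the event was signaled via
    // zeCommandListAppendWaitOnEvents, which does not record timestamps.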
--- .../v2/queue_immediate_in_order.cpp | 36 +++++++++++++- .../v2/queue_immediate_in_order.hpp | 5 ++ .../event/urEventGetProfilingInfo.cpp | 48 +++++++++++++++++++ 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 05e48c8740..5c3b585611 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -282,14 +282,46 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier"); + + std::scoped_lock lock(this->Mutex); + + if (!numEventsInWaitList && !phEvent) { + // nop + return UR_RESULT_SUCCESS; + } + + auto signalEvent = + getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (handler.commandList.get(), signalEvent->getZeEvent(), + numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // For in-order queue we don't need a real barrier, just wait for // requested events in potentially different queues and add a "barrier" // event signal because it is already guaranteed that previous commands - // in this queue are completed when the signal is started. - return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); + // in this queue are completed when the signal is started. 
However, we do + // need to use barrier if profiling is enabled: see + // zeCommandListAppendWaitOnEvents + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { + return enqueueEventsWaitWithBarrierImpl(numEventsInWaitList, + phEventWaitList, phEvent); + } else { + return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); + } } ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierExt( diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index bdd3009d63..03fdbe0075 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -77,6 +77,11 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t commandType); + ur_result_t + enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, const ur_queue_properties_t *); diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 7b91679dad..6289de7b9e 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -121,3 +121,51 @@ TEST_P(urEventGetProfilingInfoNegativeTest, InvalidValue) { } UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEventGetProfilingInfoNegativeTest); + +struct urEventGetProfilingInfoForWaitWithBarrier : uur::urProfilingQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urProfilingQueueTest::SetUp()); + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, + nullptr, &buffer)); + + input.assign(count, 42); + ur_event_handle_t membuf_event = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue, buffer, false, 0, size, + input.data(), 0, nullptr, + &membuf_event)); + + ASSERT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &membuf_event, &event)); + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE(urProfilingQueueTest::TearDown()); + } + + const size_t count = 1024; + const size_t size = sizeof(uint32_t) * count; + ur_mem_handle_t buffer = nullptr; + ur_event_handle_t event = nullptr; + std::vector input; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEventGetProfilingInfoForWaitWithBarrier); + +TEST_P(urEventGetProfilingInfoForWaitWithBarrier, Success) { + std::vector submit_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, + UR_PROFILING_INFO_COMMAND_START, + size, submit_data.data(), nullptr)); + auto start_timing = reinterpret_cast(submit_data.data()); + ASSERT_NE(*start_timing, 0); + + std::vector complete_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + size, complete_data.data(), + nullptr)); + auto end_timing = reinterpret_cast(complete_data.data()); + ASSERT_NE(*end_timing, 0); + + ASSERT_GT(*end_timing, *start_timing); +} From fae0932ad081fa563007bb2f4073f96a4b06c7f7 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 27 Nov 2024 20:30:04 +0000 Subject: [PATCH 104/148] [L0 v2] use waitOnEvents instead of barrier when appropriate. This is the same behavior as in legacy adapter. 
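Semantics sketch for an in-order immediate command list (reasoning only,
not adapter code):

    // zeCommandListAppendBarrier(list, signal, n, waitEvents);
    //   -> orders against waitEvents AND all previously appended commands
    // zeCommandListAppendWaitOnEvents(list, n, waitEvents);
    //   -> orders only against waitEvents
    //
    // Since the list is in-order, previous commands are already ordered,
    // so the lighter wait is sufficient for the prefetch/advise paths.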
--- .../adapters/level_zero/v2/queue_immediate_in_order.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 5c3b585611..318d9a8b7a 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -788,8 +788,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendBarrier, (handler.commandList.get(), nullptr, - numWaitEvents, pWaitEvents)); + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (handler.commandList.get(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, @@ -820,8 +820,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); if (pWaitEvents) { - ZE2UR_CALL(zeCommandListAppendBarrier, (handler.commandList.get(), nullptr, - numWaitEvents, pWaitEvents)); + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (handler.commandList.get(), numWaitEvents, pWaitEvents)); } // TODO: figure out how to translate "flags" From 0185baf440f6402ef7cc2de464190c16b48dd4ae Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 11 Dec 2024 09:23:07 -0800 Subject: [PATCH 105/148] [L0] Fix Device Info Reporting for vector width to match spec - Given device has a lack of fp64 or fp16 support, return 0 as the vector width. Signed-off-by: Neil R. Spruit --- source/adapters/level_zero/device.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index b7422fe2cc..6705c4c659 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -654,9 +654,15 @@ ur_result_t urDeviceGetInfo( return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: + // Must return 0 for *vector_width_double* if the device does not have fp64. + if (!(Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64)) + return ReturnValue(uint32_t{0}); return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: + // Must return 0 for *vector_width_half* if the device does not have fp16. 
+ if (!(Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16)) + return ReturnValue(uint32_t{0}); return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); From 122ba914189bde4df28ed7a00928445538c5eb0b Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 10 Dec 2024 20:28:02 +0000 Subject: [PATCH 106/148] [L0 v2] add missing catches to usm functions --- source/adapters/level_zero/v2/usm.cpp | 48 ++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/source/adapters/level_zero/v2/usm.cpp b/source/adapters/level_zero/v2/usm.cpp index f7396e282f..1502ac7b5c 100644 --- a/source/adapters/level_zero/v2/usm.cpp +++ b/source/adapters/level_zero/v2/usm.cpp @@ -236,18 +236,26 @@ ur_result_t urUSMPoolCreate( ur_result_t urUSMPoolRetain(ur_usm_pool_handle_t hPool ///< [in] pointer to USM memory pool -) { + ) try { hPool->RefCount.increment(); return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMPoolRelease(ur_usm_pool_handle_t hPool ///< [in] pointer to USM memory pool -) { + ) try { if (hPool->RefCount.decrementAndTest()) { delete hPool; } return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMPoolGetInfo( @@ -258,7 +266,7 @@ ur_result_t urUSMPoolGetInfo( ///< property size_t *pPropSizeRet ///< [out] size in bytes returned in pool property value -) { + ) try { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -272,6 +280,10 @@ ur_result_t urUSMPoolGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMDeviceAlloc( @@ -284,13 +296,17 @@ ur_result_t urUSMDeviceAlloc( size_t size, ///< [in] size in bytes of the USM memory object to be allocated void **ppRetMem ///< [out] pointer to USM device memory object -) { + ) try { if (!hPool) { hPool = hContext->getDefaultUSMPool(); } return hPool->allocate(hContext, hDevice, pUSMDesc, UR_USM_TYPE_DEVICE, size, ppRetMem); +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMSharedAlloc( @@ -303,13 +319,17 @@ ur_result_t urUSMSharedAlloc( size_t size, ///< [in] size in bytes of the USM memory object to be allocated void **ppRetMem ///< [out] pointer to USM shared memory object -) { + ) try { if (!hPool) { hPool = hContext->getDefaultUSMPool(); } return hPool->allocate(hContext, hDevice, pUSMDesc, UR_USM_TYPE_SHARED, size, ppRetMem); +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMHostAlloc( @@ -321,21 +341,29 @@ ur_result_t urUSMHostAlloc( size_t size, ///< [in] size in bytes of the USM memory object to be allocated void **ppRetMem ///< [out] pointer to USM host memory object -) { + ) try { if (!hPool) { hPool = hContext->getDefaultUSMPool(); } return hPool->allocate(hContext, nullptr, pUSMDesc, UR_USM_TYPE_HOST, size, ppRetMem); +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); } ur_result_t urUSMFree(ur_context_handle_t hContext, ///< [in] handle of the context object void *pMem ///< [in] pointer to USM memory object -) { + ) try { std::ignore = hContext; return umf::umf2urResult(umfFree(pMem)); +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } ur_result_t urUSMGetMemAllocInfo( @@ -348,7 +376,7 @@ ur_result_t urUSMGetMemAllocInfo( void *pPropValue, ///< [out][optional] value of the USM allocation property size_t *pPropValueSizeRet ///< [out][optional] bytes returned in USM ///< allocation property -) { + ) try { ze_device_handle_t zeDeviceHandle; ZeStruct zeMemoryAllocationProperties; @@ -412,5 +440,9 @@ ur_result_t urUSMGetMemAllocInfo( } } return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); } } // namespace ur::level_zero From 5a816c5b92a051e32b1e00304c1771d6c4c4efe9 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 19 Nov 2024 21:21:49 +0000 Subject: [PATCH 107/148] [L0 v2] event improvements - Implement native handle API for events - Always allocate events as multi-device when using provider_normal, as all events can be used on multiple devices --- source/adapters/level_zero/v2/api.cpp | 15 - source/adapters/level_zero/v2/context.cpp | 17 +- source/adapters/level_zero/v2/context.hpp | 4 + source/adapters/level_zero/v2/event.cpp | 266 ++++++++++++------ source/adapters/level_zero/v2/event.hpp | 80 +++++- source/adapters/level_zero/v2/event_pool.cpp | 14 +- source/adapters/level_zero/v2/event_pool.hpp | 18 +- .../level_zero/v2/event_pool_cache.cpp | 19 +- .../level_zero/v2/event_pool_cache.hpp | 5 +- .../adapters/level_zero/v2/event_provider.hpp | 1 - .../level_zero/v2/event_provider_counter.cpp | 5 +- .../level_zero/v2/event_provider_counter.hpp | 4 - .../level_zero/v2/event_provider_normal.cpp | 17 +- .../level_zero/v2/event_provider_normal.hpp | 16 +- .../v2/queue_immediate_in_order.cpp | 3 +- test/adapters/level_zero/CMakeLists.txt | 84 +++--- .../urEventCreateWithNativeHandle.cpp | 66 ++--- .../level_zero/v2/event_pool_test.cpp | 113 ++++++-- test/adapters/level_zero/ze_helpers.hpp | 48 ++++ 19 files changed, 540 insertions(+), 255 deletions(-) create mode 100644 test/adapters/level_zero/ze_helpers.hpp diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 593115a99f..eaec082055 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -181,21 +181,6 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent, - ur_native_handle_t *phNativeEvent) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent, - ur_context_handle_t hContext, - const ur_event_native_properties_t *pProperties, - ur_event_handle_t *phEvent) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, ur_event_callback_t pfnNotify, void *pUserData) { diff --git a/source/adapters/level_zero/v2/context.cpp b/source/adapters/level_zero/v2/context.cpp index 
89e77c98e9..cd4d8bfc9d 100644 --- a/source/adapters/level_zero/v2/context.cpp +++ b/source/adapters/level_zero/v2/context.cpp @@ -11,6 +11,7 @@ #include "../device.hpp" #include "context.hpp" +#include "event_provider_counter.hpp" #include "event_provider_normal.hpp" static std::vector @@ -48,14 +49,22 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, const ur_device_handle_t *phDevices, bool ownZeContext) : commandListCache(hContext), - eventPoolCache(phDevices[0]->Platform->getNumDevices(), + eventPoolCache(this, phDevices[0]->Platform->getNumDevices(), [context = this, platform = phDevices[0]->Platform]( - DeviceId deviceId, v2::event_flags_t flags) { - auto device = platform->getDeviceById(deviceId); + DeviceId deviceId, v2::event_flags_t flags) + -> std::unique_ptr { + assert((flags & v2::EVENT_FLAGS_COUNTER) != 0); + + std::ignore = deviceId; + std::ignore = platform; + // TODO: just use per-context id? return std::make_unique( - context, device, v2::QUEUE_IMMEDIATE, flags); + context, v2::QUEUE_IMMEDIATE, flags); }), + nativeEventsPool(this, std::make_unique( + this, v2::QUEUE_IMMEDIATE, + v2::EVENT_FLAGS_PROFILING_ENABLED)), hContext(hContext, ownZeContext), hDevices(phDevices, phDevices + numDevices), p2pAccessDevices(populateP2PDevices( diff --git a/source/adapters/level_zero/v2/context.hpp b/source/adapters/level_zero/v2/context.hpp index b3ba6050dd..40dc725e86 100644 --- a/source/adapters/level_zero/v2/context.hpp +++ b/source/adapters/level_zero/v2/context.hpp @@ -38,6 +38,10 @@ struct ur_context_handle_t_ : _ur_object { v2::command_list_cache_t commandListCache; v2::event_pool_cache eventPoolCache; + // pool used for urEventCreateWithNativeHandle when native handle is NULL + // (uses non-counter based events to allow for signaling from host) + v2::event_pool nativeEventsPool; + private: const v2::raii::ze_context_handle_t hContext; const std::vector hDevices; diff --git a/source/adapters/level_zero/v2/event.cpp b/source/adapters/level_zero/v2/event.cpp index f3319351e3..7dec824572 100644 --- a/source/adapters/level_zero/v2/event.cpp +++ b/source/adapters/level_zero/v2/event.cpp @@ -10,6 +10,7 @@ #include +#include "context.hpp" #include "event.hpp" #include "event_pool.hpp" #include "event_provider.hpp" @@ -17,32 +18,118 @@ #include "../ur_interface_loader.hpp" -ur_event_handle_t_::ur_event_handle_t_( - v2::raii::cache_borrowed_event eventAllocation, v2::event_pool *pool) - : zeEvent(std::move(eventAllocation)), pool(pool), - adjustedEventStartTimestamp(0), recordEventEndTimestamp(0), - adjustedEventEndTimestamp(0), - zeTimerResolution(getDevice()->ZeDeviceProperties->timerResolution), - timestampMaxValue(getDevice()->getTimestampMask()) {} +static uint64_t adjustEndEventTimestamp(uint64_t adjustedStartTimestamp, + uint64_t endTimestamp, + uint64_t timestampMaxValue, + uint64_t timerResolution) { + // End time needs to be adjusted for resolution and valid bits. + uint64_t adjustedTimestamp = + (endTimestamp & timestampMaxValue) * timerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
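+  // Worked example (hypothetical numbers): with a 32-bit HW counter,
+  // timestampMaxValue = 0xFFFFFFFF. If the start was recorded near the
+  // top of that range and the masked end reads a small value, the counter
+  // wrapped once, so one extra timestampMaxValue * timerResolution is
+  // added below to keep the end time monotonic.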
+ if (adjustedTimestamp < adjustedStartTimestamp) + adjustedTimestamp += timestampMaxValue * timerResolution; + + return adjustedTimestamp; +} + +uint64_t event_profiling_data_t::getEventEndTimestamp() { + // If adjustedEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (adjustedEventEndTimestamp) + return adjustedEventEndTimestamp; + + auto status = zeEventQueryStatus(hZeEvent); + if (status != ZE_RESULT_SUCCESS) { + // profiling info not ready + return 0; + } + + assert(zeTimerResolution); + assert(timestampMaxValue); + + adjustedEventEndTimestamp = adjustEndEventTimestamp( + adjustedEventStartTimestamp, recordEventEndTimestamp, timestampMaxValue, + zeTimerResolution); + + return adjustedEventEndTimestamp; +} + +void event_profiling_data_t::recordStartTimestamp(ur_device_handle_t hDevice) { + zeTimerResolution = hDevice->ZeDeviceProperties->timerResolution; + timestampMaxValue = hDevice->getTimestampMask(); + + uint64_t deviceStartTimestamp = 0; + UR_CALL_THROWS(ur::level_zero::urDeviceGetGlobalTimestamps( + hDevice, &deviceStartTimestamp, nullptr)); + + assert(adjustedEventStartTimestamp == 0); + adjustedEventStartTimestamp = deviceStartTimestamp; +} + +uint64_t event_profiling_data_t::getEventStartTimestmap() const { + return adjustedEventStartTimestamp; +} + +bool event_profiling_data_t::recordingEnded() const { + return adjustedEventEndTimestamp != 0; +} + +bool event_profiling_data_t::recordingStarted() const { + return adjustedEventStartTimestamp != 0; +} + +uint64_t *event_profiling_data_t::eventEndTimestampAddr() { + return &recordEventEndTimestamp; +} + +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t hContext, + ze_event_handle_t hZeEvent, + v2::event_flags_t flags) + : hContext(hContext), hZeEvent(hZeEvent), flags(flags), + profilingData(hZeEvent) {} void ur_event_handle_t_::resetQueueAndCommand(ur_queue_handle_t hQueue, ur_command_t commandType) { this->hQueue = hQueue; this->commandType = commandType; + profilingData = event_profiling_data_t(hZeEvent); +} + +void ur_event_handle_t_::recordStartTimestamp() { + assert(hQueue); // queue must be set before calling this + + ur_device_handle_t hDevice; + UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), + reinterpret_cast(&hDevice), + nullptr)); + + profilingData.recordStartTimestamp(hDevice); +} + +uint64_t ur_event_handle_t_::getEventStartTimestmap() const { + return profilingData.getEventStartTimestmap(); +} + +uint64_t ur_event_handle_t_::getEventEndTimestamp() { + return profilingData.getEventEndTimestamp(); } void ur_event_handle_t_::reset() { - // consider making an abstraction for regular/counter based + // consider make an abstraction for regular/counter based // events if there's more of this type of conditions - if (!(pool->getFlags() & v2::EVENT_FLAGS_COUNTER)) { - zeEventHostReset(zeEvent.get()); + if (!(flags & v2::EVENT_FLAGS_COUNTER)) { + zeEventHostReset(hZeEvent); } } ze_event_handle_t ur_event_handle_t_::getZeEvent() const { - assert(hQueue); - assert(commandType != UR_COMMAND_FORCE_UINT32); - return zeEvent.get(); + assert(hZeEvent); + return hZeEvent; } ur_result_t ur_event_handle_t_::retain() { @@ -51,11 +138,10 @@ ur_result_t ur_event_handle_t_::retain() { } ur_result_t ur_event_handle_t_::releaseDeferred() { - assert(zeEventQueryStatus(zeEvent.get()) == ZE_RESULT_SUCCESS); + assert(zeEventQueryStatus(hZeEvent) == ZE_RESULT_SUCCESS); 
assert(RefCount.load() == 0); - pool->free(this); - return UR_RESULT_SUCCESS; + return this->forceRelease(); } ur_result_t ur_event_handle_t_::release() { @@ -65,93 +151,66 @@ ur_result_t ur_event_handle_t_::release() { // Need to take a lock before checking if the event is timestamped. std::unique_lock lock(Mutex); - if (isTimestamped() && adjustedEventEndTimestamp == 0) { + if (isTimestamped() && !getEventEndTimestamp()) { // L0 will write end timestamp to this event some time in the future, // so we can't release it yet. - assert(hQueue); hQueue->deferEventFree(this); return UR_RESULT_SUCCESS; } - pool->free(this); + // Need to unlock now, as forceRelease might deallocate memory backing + // the Mutex. + lock.unlock(); - return UR_RESULT_SUCCESS; + return this->forceRelease(); } bool ur_event_handle_t_::isTimestamped() const { - // If we are recording, the start time of the event will be non-zero. - return adjustedEventStartTimestamp != 0; + return profilingData.recordingStarted(); } bool ur_event_handle_t_::isProfilingEnabled() const { - return pool->getFlags() & v2::EVENT_FLAGS_PROFILING_ENABLED; + return flags & v2::EVENT_FLAGS_PROFILING_ENABLED; } -ur_device_handle_t ur_event_handle_t_::getDevice() const { - return pool->getProvider()->device(); -} - -uint64_t ur_event_handle_t_::getEventStartTimestmap() const { - return adjustedEventStartTimestamp; -} - -static uint64_t adjustEndEventTimestamp(uint64_t adjustedStartTimestamp, - uint64_t endTimestamp, - uint64_t timestampMaxValue, - uint64_t timerResolution) { - // End time needs to be adjusted for resolution and valid bits. - uint64_t adjustedTimestamp = - (endTimestamp & timestampMaxValue) * timerResolution; - - // Handle a possible wrap-around (the underlying HW counter is < 64-bit). - // Note, it will not report correct time if there were multiple wrap - // arounds, and the longer term plan is to enlarge the capacity of the - // HW timestamps. - if (adjustedTimestamp < adjustedStartTimestamp) - adjustedTimestamp += timestampMaxValue * timerResolution; - - return adjustedTimestamp; +std::pair +ur_event_handle_t_::getEventEndTimestampAndHandle() { + return {profilingData.eventEndTimestampAddr(), hZeEvent}; } -uint64_t ur_event_handle_t_::getEventEndTimestamp() { - // If adjustedEventEndTimestamp on the event is non-zero it means it has - // collected the result of the queue already. In that case it has been - // adjusted and is ready for immediate return. 
- if (adjustedEventEndTimestamp) - return adjustedEventEndTimestamp; - - auto status = zeEventQueryStatus(zeEvent.get()); - if (status != ZE_RESULT_SUCCESS) { - // profiling info not ready - return 0; - } +ur_queue_handle_t ur_event_handle_t_::getQueue() const { return hQueue; } - adjustedEventEndTimestamp = - adjustEndEventTimestamp(getEventStartTimestmap(), recordEventEndTimestamp, - timestampMaxValue, zeTimerResolution); +ur_context_handle_t ur_event_handle_t_::getContext() const { return hContext; } - return adjustedEventEndTimestamp; -} +ur_command_t ur_event_handle_t_::getCommandType() const { return commandType; } -void ur_event_handle_t_::recordStartTimestamp() { - uint64_t deviceStartTimestamp = 0; - UR_CALL_THROWS(ur::level_zero::urDeviceGetGlobalTimestamps( - getDevice(), &deviceStartTimestamp, nullptr)); +ur_pooled_event_t::ur_pooled_event_t( + ur_context_handle_t hContext, + v2::raii::cache_borrowed_event eventAllocation, v2::event_pool *pool) + : ur_event_handle_t_(hContext, eventAllocation.get(), pool->getFlags()), + zeEvent(std::move(eventAllocation)), pool(pool) {} - assert(adjustedEventStartTimestamp == 0); - adjustedEventStartTimestamp = deviceStartTimestamp; +ur_result_t ur_pooled_event_t::forceRelease() { + pool->free(this); + return UR_RESULT_SUCCESS; } -std::pair -ur_event_handle_t_::getEventEndTimestampAndHandle() { - return {&recordEventEndTimestamp, zeEvent.get()}; +ur_native_event_t::ur_native_event_t( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties) + : ur_event_handle_t_( + hContext, + reinterpret_cast(hNativeEvent), v2::EVENT_FLAGS_PROFILING_ENABLED /* TODO: this follows legacy adapter logic, we could check this with zeEventGetPool */), + zeEvent(reinterpret_cast(hNativeEvent), + pProperties ? 
pProperties->isNativeHandleOwned : false) {} + +ur_result_t ur_native_event_t::forceRelease() { + zeEvent.release(); + delete this; + return UR_RESULT_SUCCESS; } -ur_queue_handle_t ur_event_handle_t_::getQueue() const { return hQueue; } - -ur_command_t ur_event_handle_t_::getCommandType() const { return commandType; } - namespace ur::level_zero { ur_result_t urEventRetain(ur_event_handle_t hEvent) try { return hEvent->retain(); @@ -195,14 +254,10 @@ ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, return returnValue(hEvent->RefCount.load()); } case UR_EVENT_INFO_COMMAND_QUEUE: { - return returnValue(ur_queue_handle_t{hEvent->getQueue()}); + return returnValue(hEvent->getQueue()); } case UR_EVENT_INFO_CONTEXT: { - ur_context_handle_t hContext; - UR_CALL(::ur::level_zero::urQueueGetInfo( - hEvent->getQueue(), UR_QUEUE_INFO_CONTEXT, sizeof(hContext), - reinterpret_cast(&hContext), nullptr)); - return returnValue(hContext); + return returnValue(hEvent->getContext()); } case UR_EVENT_INFO_COMMAND_TYPE: { return returnValue(hEvent->getCommandType()); @@ -258,11 +313,21 @@ ur_result_t urEventGetProfilingInfo( } } + auto hQueue = hEvent->getQueue(); + if (!hQueue) { + // no command has been enqueued with this event yet + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + ze_kernel_timestamp_result_t tsResult; - auto zeTimerResolution = - hEvent->getDevice()->ZeDeviceProperties->timerResolution; - auto timestampMaxValue = hEvent->getDevice()->getTimestampMask(); + ur_device_handle_t hDevice; + UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), + reinterpret_cast(&hDevice), + nullptr)); + + auto zeTimerResolution = hDevice->ZeDeviceProperties->timerResolution; + auto timestampMaxValue = hDevice->getTimestampMask(); switch (propName) { case UR_PROFILING_INFO_COMMAND_START: { @@ -281,6 +346,7 @@ ur_result_t urEventGetProfilingInfo( auto adjustedEndTime = adjustEndEventTimestamp(contextStartTime, tsResult.global.kernelEnd, timestampMaxValue, zeTimerResolution); + return returnValue(adjustedEndTime); } case UR_PROFILING_INFO_COMMAND_QUEUED: @@ -300,4 +366,32 @@ ur_result_t urEventGetProfilingInfo( } catch (...) { return exceptionToResult(std::current_exception()); } + +ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent, + ur_native_handle_t *phNativeEvent) try { + *phNativeEvent = reinterpret_cast(hEvent->getZeEvent()); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t +urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent, + ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) try { + if (!hNativeEvent) { + assert((hContext->nativeEventsPool.getFlags() & v2::EVENT_FLAGS_COUNTER) == + 0); + + *phEvent = hContext->nativeEventsPool.allocate(); + ZE2UR_CALL(zeEventHostSignal, ((*phEvent)->getZeEvent())); + } else { + *phEvent = new ur_native_event_t(hNativeEvent, hContext, pProperties); + } + return UR_RESULT_SUCCESS; +} catch (...) 
{
+    return exceptionToResult(std::current_exception());
+}
+
 } // namespace ur::level_zero
diff --git a/source/adapters/level_zero/v2/event.hpp b/source/adapters/level_zero/v2/event.hpp
index 3c22ef1337..9e2331c649 100644
--- a/source/adapters/level_zero/v2/event.hpp
+++ b/source/adapters/level_zero/v2/event.hpp
@@ -22,18 +22,47 @@ namespace v2 {
 class event_pool;
 }

+struct event_profiling_data_t {
+    event_profiling_data_t(ze_event_handle_t hZeEvent) : hZeEvent(hZeEvent) {}
+
+    void recordStartTimestamp(ur_device_handle_t hDevice);
+    uint64_t getEventStartTimestmap() const;
+
+    uint64_t getEventEndTimestamp();
+    uint64_t *eventEndTimestampAddr();
+
+    bool recordingStarted() const;
+    bool recordingEnded() const;
+
+private:
+    ze_event_handle_t hZeEvent;
+
+    uint64_t adjustedEventStartTimestamp = 0;
+    uint64_t recordEventEndTimestamp = 0;
+    uint64_t adjustedEventEndTimestamp = 0;
+
+    uint64_t zeTimerResolution = 0;
+    uint64_t timestampMaxValue = 0;
+};
+
 struct ur_event_handle_t_ : _ur_object {
   public:
-    ur_event_handle_t_(v2::raii::cache_borrowed_event eventAllocation,
-                       v2::event_pool *pool);
+    ur_event_handle_t_(ur_context_handle_t hContext, ze_event_handle_t hZeEvent,
+                       v2::event_flags_t flags);

     // Set the queue and command that this event is associated with
     void resetQueueAndCommand(ur_queue_handle_t hQueue,
                               ur_command_t commandType);

+    // releases event immediately
+    virtual ur_result_t forceRelease() = 0;
+    virtual ~ur_event_handle_t_() = default;
+
     void reset();
     ze_event_handle_t getZeEvent() const;

     ur_result_t retain();
+
+    // releases event immediately, or adds to a list for deferred deletion
     ur_result_t release();

     // releases a signaled and no longer in-use event, that's on the
@@ -47,15 +76,18 @@ struct ur_event_handle_t_ : _ur_object {
     // Tells if this event comes from a pool that has profiling enabled.
     bool isProfilingEnabled() const;

-    // Device associated with this event
-    ur_device_handle_t getDevice() const;
-
-    // Queue associated with this event
+    // Queue associated with this event. Can be nullptr (for native events)
     ur_queue_handle_t getQueue() const;

+    // Context associated with this event
+    ur_context_handle_t getContext() const;
+
     // Get the type of the command that this event is associated with
     ur_command_t getCommandType() const;

+    // Record the start timestamp of the event, to be obtained by
+    // urEventGetProfilingInfo. resetQueueAndCommand should be
+    // called before this.
     void recordStartTimestamp();

     // Get pointer to the end timestamp, and ze event handle.
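
To make the profiling flow concrete, here is a minimal usage sketch. It is illustrative only: the helper name and the polling loop are assumptions, while the member calls mirror the declarations above (including the header's spelling of getEventStartTimestmap):

// Sketch, assuming profiling is enabled on the event's pool.
void profileOnce(ur_event_handle_t hEvent) {
    // Snapshot the device's global timestamp before the work is submitted.
    hEvent->recordStartTimestamp();
    // ... enqueue work that signals the underlying Level Zero event ...
    uint64_t endNs = 0;
    while ((endNs = hEvent->getEventEndTimestamp()) == 0) {
        // 0 means L0 has not written the end timestamp yet; while in this
        // state, release() defers the actual free (see deferEventFree above).
    }
    uint64_t durationNs = endNs - hEvent->getEventStartTimestmap();
    (void)durationNs;
}
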
@@ -65,16 +97,40 @@ struct ur_event_handle_t_ : _ur_object { uint64_t getEventStartTimestmap() const; uint64_t getEventEndTimestamp(); -private: +protected: + ur_context_handle_t hContext; + + // non-owning handle to the L0 event + const ze_event_handle_t hZeEvent; + + // queue and commandType that this event is associated with, set by enqueue + // commands ur_queue_handle_t hQueue = nullptr; ur_command_t commandType = UR_COMMAND_FORCE_UINT32; + + v2::event_flags_t flags; + event_profiling_data_t profilingData; +}; + +struct ur_pooled_event_t : ur_event_handle_t_ { + ur_pooled_event_t(ur_context_handle_t hContext, + v2::raii::cache_borrowed_event eventAllocation, + v2::event_pool *pool); + + ur_result_t forceRelease() override; + +private: v2::raii::cache_borrowed_event zeEvent; v2::event_pool *pool; +}; + +struct ur_native_event_t : ur_event_handle_t_ { + ur_native_event_t(ur_native_handle_t hNativeEvent, + ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties); - uint64_t adjustedEventStartTimestamp; - uint64_t recordEventEndTimestamp; - uint64_t adjustedEventEndTimestamp; + ur_result_t forceRelease() override; - const uint64_t zeTimerResolution; - const uint64_t timestampMaxValue; +private: + v2::raii::ze_event_handle_t zeEvent; }; diff --git a/source/adapters/level_zero/v2/event_pool.cpp b/source/adapters/level_zero/v2/event_pool.cpp index 523aaf7fb9..d7e1d451ac 100644 --- a/source/adapters/level_zero/v2/event_pool.cpp +++ b/source/adapters/level_zero/v2/event_pool.cpp @@ -9,14 +9,15 @@ //===----------------------------------------------------------------------===// #include "event_pool.hpp" #include "common/latency_tracker.hpp" +#include "event.hpp" +#include "queue_api.hpp" #include "ur_api.h" namespace v2 { static constexpr size_t EVENTS_BURST = 64; -ur_event_handle_t_ *event_pool::allocate(ur_queue_handle_t hQueue, - ur_command_t commandType) { +ur_pooled_event_t *event_pool::allocate() { TRACK_SCOPE_LATENCY("event_pool::allocate"); std::unique_lock lock(*mutex); @@ -25,7 +26,7 @@ ur_event_handle_t_ *event_pool::allocate(ur_queue_handle_t hQueue, auto start = events.size(); auto end = start + EVENTS_BURST; for (; start < end; ++start) { - events.emplace_back(provider->allocate(), this); + events.emplace_back(hContext, provider->allocate(), this); freelist.push_back(&events.at(start)); } } @@ -33,12 +34,15 @@ ur_event_handle_t_ *event_pool::allocate(ur_queue_handle_t hQueue, auto event = freelist.back(); freelist.pop_back(); - event->resetQueueAndCommand(hQueue, commandType); +#ifndef NDEBUG + // Set the command type to an invalid value to catch any misuses in tests + event->resetQueueAndCommand(nullptr, UR_COMMAND_FORCE_UINT32); +#endif return event; } -void event_pool::free(ur_event_handle_t_ *event) { +void event_pool::free(ur_pooled_event_t *event) { TRACK_SCOPE_LATENCY("event_pool::free"); std::unique_lock lock(*mutex); diff --git a/source/adapters/level_zero/v2/event_pool.hpp b/source/adapters/level_zero/v2/event_pool.hpp index e9ad4051e6..c78f34b492 100644 --- a/source/adapters/level_zero/v2/event_pool.hpp +++ b/source/adapters/level_zero/v2/event_pool.hpp @@ -29,8 +29,10 @@ namespace v2 { class event_pool { public: // store weak reference to the queue as event_pool is part of the queue - event_pool(std::unique_ptr Provider) - : provider(std::move(Provider)), mutex(std::make_unique()){}; + event_pool(ur_context_handle_t hContext, + std::unique_ptr Provider) + : hContext(hContext), provider(std::move(Provider)), + mutex(std::make_unique()){}; 
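    // Pooling strategy, for reference (matches event_pool.cpp above): events
    // are constructed EVENTS_BURST at a time directly inside the `events`
    // deque, whose elements keep stable addresses, while `freelist` holds raw
    // pointers to the entries currently unused; allocate() pops from the
    // freelist and free() pushes back onto it, both under `mutex`.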
event_pool(event_pool &&other) = default; event_pool &operator=(event_pool &&other) = default; @@ -38,23 +40,21 @@ class event_pool { event_pool(const event_pool &) = delete; event_pool &operator=(const event_pool &) = delete; - DeviceId Id() { return provider->device()->Id.value(); }; - // Allocate an event from the pool. Thread safe. - ur_event_handle_t_ *allocate(ur_queue_handle_t hQueue, - ur_command_t commandType); + ur_pooled_event_t *allocate(); // Free an event back to the pool. Thread safe. - void free(ur_event_handle_t_ *event); + void free(ur_pooled_event_t *event); event_provider *getProvider() const; event_flags_t getFlags() const; private: + ur_context_handle_t hContext; std::unique_ptr provider; - std::deque events; - std::vector freelist; + std::deque events; + std::vector freelist; std::unique_ptr mutex; }; diff --git a/source/adapters/level_zero/v2/event_pool_cache.cpp b/source/adapters/level_zero/v2/event_pool_cache.cpp index f0d16bed02..620ac0867d 100644 --- a/source/adapters/level_zero/v2/event_pool_cache.cpp +++ b/source/adapters/level_zero/v2/event_pool_cache.cpp @@ -13,14 +13,13 @@ namespace v2 { -event_pool_cache::event_pool_cache(size_t max_devices, +event_pool_cache::event_pool_cache(ur_context_handle_t hContext, + size_t max_devices, ProviderCreateFunc ProviderCreate) - : providerCreate(ProviderCreate) { + : hContext(hContext), providerCreate(ProviderCreate) { pools.resize(max_devices * (1ULL << EVENT_FLAGS_USED_BITS)); } -event_pool_cache::~event_pool_cache() {} - raii::cache_borrowed_event_pool event_pool_cache::borrow(DeviceId id, event_flags_t flags) { std::unique_lock Lock(mutex); @@ -33,16 +32,18 @@ raii::cache_borrowed_event_pool event_pool_cache::borrow(DeviceId id, auto &vec = pools[event_desc.index()]; if (vec.empty()) { - vec.emplace_back(std::make_unique(providerCreate(id, flags))); + vec.emplace_back( + std::make_unique(hContext, providerCreate(id, flags))); } auto pool = vec.back().release(); vec.pop_back(); - return raii::cache_borrowed_event_pool(pool, [this, flags](event_pool *pool) { - std::unique_lock Lock(mutex); - pools[event_descriptor{pool->Id(), flags}.index()].emplace_back(pool); - }); + return raii::cache_borrowed_event_pool( + pool, [this, id, flags](event_pool *pool) { + std::unique_lock Lock(mutex); + pools[event_descriptor{id, flags}.index()].emplace_back(pool); + }); } } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_pool_cache.hpp b/source/adapters/level_zero/v2/event_pool_cache.hpp index 78d909182c..feff5d325e 100644 --- a/source/adapters/level_zero/v2/event_pool_cache.hpp +++ b/source/adapters/level_zero/v2/event_pool_cache.hpp @@ -35,12 +35,13 @@ class event_pool_cache { using ProviderCreateFunc = std::function( DeviceId, event_flags_t flags)>; - event_pool_cache(size_t max_devices, ProviderCreateFunc); - ~event_pool_cache(); + event_pool_cache(ur_context_handle_t hContext, size_t max_devices, + ProviderCreateFunc); raii::cache_borrowed_event_pool borrow(DeviceId, event_flags_t flags); private: + ur_context_handle_t hContext; ur_mutex mutex; ProviderCreateFunc providerCreate; diff --git a/source/adapters/level_zero/v2/event_provider.hpp b/source/adapters/level_zero/v2/event_provider.hpp index 1fb87a8b6a..c6bedb8fc1 100644 --- a/source/adapters/level_zero/v2/event_provider.hpp +++ b/source/adapters/level_zero/v2/event_provider.hpp @@ -40,7 +40,6 @@ class event_provider { public: virtual ~event_provider() = default; virtual raii::cache_borrowed_event allocate() = 0; - virtual ur_device_handle_t device() = 0; 
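    // Note: the device() accessor was removed because providers are now
    // per-context rather than per-device; the normal provider creates its
    // Level Zero event pool across all devices in the context (see
    // event_provider_normal.cpp below).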
virtual event_flags_t eventFlags() const = 0; }; diff --git a/source/adapters/level_zero/v2/event_provider_counter.cpp b/source/adapters/level_zero/v2/event_provider_counter.cpp index 8d7ecf8227..886fd53db4 100644 --- a/source/adapters/level_zero/v2/event_provider_counter.cpp +++ b/source/adapters/level_zero/v2/event_provider_counter.cpp @@ -22,8 +22,7 @@ namespace v2 { provider_counter::provider_counter(ur_platform_handle_t platform, ur_context_handle_t context, - ur_device_handle_t device) - : urDevice(device) { + ur_device_handle_t device) { ZE2UR_CALL_THROWS(zeDriverGetExtensionFunctionAddress, (platform->ZeDriver, "zexCounterBasedEventCreate", (void **)&this->eventCreateFunc)); @@ -58,8 +57,6 @@ raii::cache_borrowed_event provider_counter::allocate() { [this](ze_event_handle_t handle) { freelist.push_back(handle); }); } -ur_device_handle_t provider_counter::device() { return urDevice; } - event_flags_t provider_counter::eventFlags() const { return EVENT_FLAGS_COUNTER; } diff --git a/source/adapters/level_zero/v2/event_provider_counter.hpp b/source/adapters/level_zero/v2/event_provider_counter.hpp index 98e405cc3f..bb46cb5daf 100644 --- a/source/adapters/level_zero/v2/event_provider_counter.hpp +++ b/source/adapters/level_zero/v2/event_provider_counter.hpp @@ -34,17 +34,13 @@ typedef ze_result_t (*zexCounterBasedEventCreate)( class provider_counter : public event_provider { public: - // TODO: does this provider support profiling? provider_counter(ur_platform_handle_t platform, ur_context_handle_t, ur_device_handle_t); raii::cache_borrowed_event allocate() override; - ur_device_handle_t device() override; event_flags_t eventFlags() const override; private: - ur_device_handle_t urDevice; - ze_context_handle_t translatedContext; ze_device_handle_t translatedDevice; diff --git a/source/adapters/level_zero/v2/event_provider_normal.cpp b/source/adapters/level_zero/v2/event_provider_normal.cpp index 4e2ab91698..029b95071b 100644 --- a/source/adapters/level_zero/v2/event_provider_normal.cpp +++ b/source/adapters/level_zero/v2/event_provider_normal.cpp @@ -24,8 +24,7 @@ namespace v2 { static constexpr int EVENTS_BURST = 64; -provider_pool::provider_pool(ur_context_handle_t context, - ur_device_handle_t device, queue_type queue, +provider_pool::provider_pool(ur_context_handle_t context, queue_type queue, event_flags_t flags) { ZeStruct desc; desc.count = EVENTS_BURST; @@ -46,10 +45,14 @@ provider_pool::provider_pool(ur_context_handle_t context, desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; } + std::vector devices; + for (auto &d : context->getDevices()) { + devices.push_back(d->ZeDevice); + } + ZE2UR_CALL_THROWS(zeEventPoolCreate, - (context->getZeHandle(), &desc, 1, - const_cast(&device->ZeDevice), - pool.ptr())); + (context->getZeHandle(), &desc, devices.size(), + devices.data(), pool.ptr())); freelist.resize(EVENTS_BURST); for (int i = 0; i < EVENTS_BURST; ++i) { @@ -75,7 +78,7 @@ raii::cache_borrowed_event provider_pool::allocate() { size_t provider_pool::nfree() const { return freelist.size(); } std::unique_ptr provider_normal::createProviderPool() { - return std::make_unique(urContext, urDevice, queueType, flags); + return std::make_unique(urContext, queueType, flags); } raii::cache_borrowed_event provider_normal::allocate() { @@ -110,8 +113,6 @@ raii::cache_borrowed_event provider_normal::allocate() { return allocate(); } -ur_device_handle_t provider_normal::device() { return urDevice; } - event_flags_t provider_normal::eventFlags() const { return flags; } } // namespace v2 diff 
--git a/source/adapters/level_zero/v2/event_provider_normal.hpp b/source/adapters/level_zero/v2/event_provider_normal.hpp index a0f672b944..811b32f2e2 100644 --- a/source/adapters/level_zero/v2/event_provider_normal.hpp +++ b/source/adapters/level_zero/v2/event_provider_normal.hpp @@ -34,8 +34,7 @@ enum queue_type { class provider_pool { public: - provider_pool(ur_context_handle_t, ur_device_handle_t, queue_type, - event_flags_t flags); + provider_pool(ur_context_handle_t, queue_type, event_flags_t flags); raii::cache_borrowed_event allocate(); size_t nfree() const; @@ -45,24 +44,19 @@ class provider_pool { std::vector freelist; }; +// supplies multi-device events for a given context class provider_normal : public event_provider { public: - provider_normal(ur_context_handle_t context, ur_device_handle_t device, - queue_type qtype, event_flags_t flags) - : queueType(qtype), urContext(context), urDevice(device), flags(flags) { - ur::level_zero::urDeviceRetain(device); - } - - ~provider_normal() override { ur::level_zero::urDeviceRelease(urDevice); } + provider_normal(ur_context_handle_t context, queue_type qtype, + event_flags_t flags) + : queueType(qtype), urContext(context), flags(flags) {} raii::cache_borrowed_event allocate() override; - ur_device_handle_t device() override; event_flags_t eventFlags() const override; private: queue_type queueType; ur_context_handle_t urContext; - ur_device_handle_t urDevice; event_flags_t flags; std::unique_ptr createProviderPool(); diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 05e48c8740..d34fef4aa7 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -105,7 +105,8 @@ ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent(ur_event_handle_t *hUserEvent, ur_command_t commandType) { if (hUserEvent) { - *hUserEvent = eventPool->allocate(this, commandType); + *hUserEvent = eventPool->allocate(); + (*hUserEvent)->resetQueueAndCommand(this, commandType); return *hUserEvent; } else { return nullptr; diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index d74d08311b..97217a1f3b 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -3,82 +3,96 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -if(UR_BUILD_ADAPTER_L0) +function(add_adapter_tests adapter) if(NOT UR_DPCXX) # Tests that require kernels can't be used if we aren't generating # device binaries message(WARNING - "UR_DPCXX is not defined, skipping some adapter tests for level_zero") + "UR_DPCXX is not defined, skipping some adapter tests for ${adapter}") else() - add_adapter_test(level_zero + add_adapter_test(${adapter} FIXTURE KERNELS SOURCES urProgramLink.cpp urKernelCreateWithNativeHandle.cpp urEventCreateWithNativeHandle.cpp ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) # TODO: valgrind tests require very new environment. # Enable once all L0 runners are updated. 
- # add_adapter_memcheck_test(level_zero + # add_adapter_memcheck_test(${adapter} # ENVIRONMENT - # "UR_ADAPTERS_FORCE_LOAD=\"$\"" + # "UR_ADAPTERS_FORCE_LOAD=\"$\"" # ) - target_link_libraries(test-adapter-level_zero PRIVATE + target_link_libraries(test-adapter-${adapter} PRIVATE LevelZeroLoader LevelZeroLoader-Headers ComputeRuntimeLevelZero-Headers ) - target_include_directories(test-adapter-level_zero PRIVATE + target_include_directories(test-adapter-${adapter} PRIVATE ${PROJECT_SOURCE_DIR}/source - ${PROJECT_SOURCE_DIR}/source/adapters/level_zero + ${PROJECT_SOURCE_DIR}/source/adapters/${adapter} LevelZeroLoader-Headers ) - add_dependencies(test-adapter-level_zero + add_dependencies(test-adapter-${adapter} generate_device_binaries kernel_names_header) endif() if(NOT WIN32 AND NOT UR_STATIC_ADAPTER_L0) - add_adapter_test(level_zero_ze_calls - FIXTURE DEVICES - SOURCES - event_cache_tests.cpp - ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "ZE_ENABLE_TRACING_LAYER=1" - ) + # TODO: adjust to work for v2 + if("${adapter}" STREQUAL "level_zero") + add_adapter_test(${adapter}_ze_calls + FIXTURE DEVICES + SOURCES + event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "ZE_ENABLE_TRACING_LAYER=1" + ) + + target_link_libraries(test-adapter-${adapter}_ze_calls PRIVATE LevelZeroLoader LevelZeroLoader-Headers) + endif() - target_link_libraries(test-adapter-level_zero_ze_calls PRIVATE LevelZeroLoader LevelZeroLoader-Headers) + # TODO: enable for v2 once driver issue is fixed + if("${adapter}" STREQUAL "level_zero") + add_adapter_test(${adapter}_multi_queue + FIXTURE DEVICES + SOURCES + multi_device_event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "ZE_ENABLE_TRACING_LAYER=1" + ) - add_adapter_test(level_zero_multi_queue + target_link_libraries(test-adapter-${adapter}_multi_queue PRIVATE LevelZeroLoader LevelZeroLoader-Headers) + endif() + endif() + + # TODO: debug for v2 + if("${adapter}" STREQUAL "level_zero") + add_adapter_test(${adapter}_ipc FIXTURE DEVICES SOURCES - multi_device_event_cache_tests.cpp + ipc.cpp ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "ZE_ENABLE_TRACING_LAYER=1" + "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) - target_link_libraries(test-adapter-level_zero_multi_queue PRIVATE LevelZeroLoader LevelZeroLoader-Headers) + target_link_libraries(test-adapter-${adapter}_ipc PRIVATE + ur_umf + ) endif() +endfunction() - add_adapter_test(level_zero_ipc - FIXTURE DEVICES - SOURCES - ipc.cpp - ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" - ) - - target_link_libraries(test-adapter-level_zero_ipc PRIVATE - ur_umf - ) +if(UR_BUILD_ADAPTER_L0) + add_adapter_tests(level_zero) endif() if(UR_BUILD_ADAPTER_L0_V2) + add_adapter_tests(level_zero_v2) add_subdirectory(v2) endif() diff --git a/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp index 7e667bfe30..1c385933db 100644 --- a/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp +++ b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp @@ -10,6 +10,8 @@ #include #include +#include "ze_helpers.hpp" + using namespace std::chrono_literals; using urLevelZeroEventNativeHandleTest = uur::urQueueTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest); @@ -17,33 +19,7 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest); #define TEST_MEMCPY_SIZE 4096 TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) { - ze_event_pool_desc_t desc; - desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC; - desc.pNext = 
nullptr; - desc.count = 1; - desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; - - ur_native_handle_t nativeContext; - ASSERT_SUCCESS(urContextGetNativeHandle(context, &nativeContext)); - - ur_native_handle_t nativeDevice; - ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &nativeDevice)); - - ze_event_pool_handle_t pool = nullptr; - - ASSERT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1, - (ze_device_handle_t *)&nativeDevice, &pool), - ZE_RESULT_SUCCESS); - - ze_event_desc_t eventDesc; - eventDesc.pNext = nullptr; - eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC; - eventDesc.index = 0; - eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - eventDesc.wait = 0; - - ze_event_handle_t zeEvent; - ASSERT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS); + auto zeEvent = createZeEvent(context, device); ur_event_native_properties_t pprops; pprops.isNativeHandleOwned = false; @@ -51,8 +27,8 @@ TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) { pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES; ur_event_handle_t urEvent; - ASSERT_SUCCESS(urEventCreateWithNativeHandle((ur_native_handle_t)zeEvent, - context, &pprops, &urEvent)); + ASSERT_SUCCESS(urEventCreateWithNativeHandle( + (ur_native_handle_t)zeEvent.get(), context, &pprops, &urEvent)); int *src = (int *)malloc(TEST_MEMCPY_SIZE); memset(src, 0xc, TEST_MEMCPY_SIZE); @@ -90,7 +66,7 @@ TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) { ASSERT_NE(memcmp(src, dst, TEST_MEMCPY_SIZE), 0); - zeEventHostSignal(zeEvent); + zeEventHostSignal(zeEvent.get()); urQueueFinish(queue); @@ -104,6 +80,32 @@ TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) { urEventRelease(memcpyEvent); urEventRelease(memcpyEvent2); urEventRelease(memcpyEvent3); - zeEventDestroy(zeEvent); - zeEventPoolDestroy(pool); +} + +TEST_P(urLevelZeroEventNativeHandleTest, NativeStatusQuery) { + auto zeEvent = createZeEvent(context, device); + + ur_event_native_properties_t pprops; + pprops.isNativeHandleOwned = false; + pprops.pNext = nullptr; + pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES; + + ur_event_handle_t urEvent; + ASSERT_SUCCESS(urEventCreateWithNativeHandle( + (ur_native_handle_t)zeEvent.get(), context, &pprops, &urEvent)); + + ur_event_status_t status; + ASSERT_SUCCESS(urEventGetInfo(urEvent, + UR_EVENT_INFO_COMMAND_EXECUTION_STATUS, + sizeof(ur_event_status_t), &status, nullptr)); + ASSERT_EQ(status, UR_EVENT_STATUS_SUBMITTED); + + zeEventHostSignal(zeEvent.get()); + + ASSERT_SUCCESS(urEventGetInfo(urEvent, + UR_EVENT_INFO_COMMAND_EXECUTION_STATUS, + sizeof(ur_event_status_t), &status, nullptr)); + ASSERT_EQ(status, UR_EVENT_STATUS_COMPLETE); + + urEventRelease(urEvent); } diff --git a/test/adapters/level_zero/v2/event_pool_test.cpp b/test/adapters/level_zero/v2/event_pool_test.cpp index 1029d471df..97b166c63d 100644 --- a/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/test/adapters/level_zero/v2/event_pool_test.cpp @@ -8,6 +8,7 @@ #include "level_zero/common.hpp" #include "level_zero/device.hpp" +#include "../ze_helpers.hpp" #include "context.hpp" #include "event_pool.hpp" #include "event_pool_cache.hpp" @@ -26,6 +27,14 @@ using namespace v2; static constexpr size_t MAX_DEVICES = 10; +// mock necessary functions from context, we can't pull in entire context implementation due to +// a lot of other dependencies +std::vector mockVec{}; +const std::vector & +ur_context_handle_t_::getDevices() const { + return mockVec; +} + enum ProviderType { TEST_PROVIDER_NORMAL, TEST_PROVIDER_COUNTER, @@ -92,14 +101,16 @@ 
printParams(const testing::TestParamInfo &info) {
     return params_stream.str();
 }

-struct EventPoolTest : public uur::urContextTestWithParam {
+struct EventPoolTest : public uur::urQueueTestWithParam {
     void SetUp() override {
-        UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp());
+        UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::SetUp());

         auto params = getParam();

+        mockVec.push_back(device);
+
         cache = std::unique_ptr<event_pool_cache>(new event_pool_cache(
-            MAX_DEVICES,
+            nullptr, MAX_DEVICES,
             [this, params](DeviceId, event_flags_t flags)
                 -> std::unique_ptr<event_provider> {
                 // normally id would be used to find the appropriate device to create the provider
@@ -109,14 +120,14 @@ struct EventPoolTest : public uur::urContextTestWithParam {
                         device);
                 case TEST_PROVIDER_NORMAL:
                     return std::make_unique<provider_normal>(
-                        context, device, params.queue, flags);
+                        context, params.queue, flags);
                 }
                 return nullptr;
             }));
     }
     void TearDown() override {
         cache.reset();
-        UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown());
+        UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::TearDown());
     }

     std::unique_ptr<event_pool_cache> cache;
@@ -150,8 +161,8 @@ TEST_P(EventPoolTest, Basic) {
     {
         auto pool = cache->borrow(device->Id.value(), getParam().flags);

-        first = pool->allocate(reinterpret_cast<ur_queue_handle_t>(0x1),
-                               UR_COMMAND_KERNEL_LAUNCH);
+        first = pool->allocate();
+        first->resetQueueAndCommand(queue, UR_COMMAND_KERNEL_LAUNCH);
         zeFirst = first->getZeEvent();

         urEventRelease(first);
@@ -161,8 +172,8 @@ TEST_P(EventPoolTest, Basic) {
     {
         auto pool = cache->borrow(device->Id.value(), getParam().flags);

-        second = pool->allocate(reinterpret_cast<ur_queue_handle_t>(0x1),
-                                UR_COMMAND_KERNEL_LAUNCH);
+        second = pool->allocate();
+        second->resetQueueAndCommand(queue, UR_COMMAND_KERNEL_LAUNCH);
         zeSecond = second->getZeEvent();

         urEventRelease(second);
@@ -181,9 +192,9 @@ TEST_P(EventPoolTest, Threaded) {
             auto pool = cache->borrow(device->Id.value(), getParam().flags);
             std::vector<ur_event_handle_t> events;
             for (int i = 0; i < 100; ++i) {
-                events.push_back(
-                    pool->allocate(reinterpret_cast<ur_queue_handle_t>(0x1),
-                                   UR_COMMAND_KERNEL_LAUNCH));
+                events.push_back(pool->allocate());
+                events.back()->resetQueueAndCommand(
+                    queue, UR_COMMAND_KERNEL_LAUNCH);
             }
             for (int i = 0; i < 100; ++i) {
                 urEventRelease(events[i]);
@@ -201,9 +212,9 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) {
     auto pool = cache->borrow(device->Id.value(), getParam().flags);
     std::list<ur_event_handle_t> events;
     for (int i = 0; i < 128; ++i) {
-        events.push_back(
-            pool->allocate(reinterpret_cast<ur_queue_handle_t>(0x1),
-                           UR_COMMAND_KERNEL_LAUNCH));
+        auto event = pool->allocate();
+        event->resetQueueAndCommand(queue, UR_COMMAND_KERNEL_LAUNCH);
+        events.push_back(event);
     }
     auto frontZeHandle = events.front()->getZeEvent();
     for (int i = 0; i < 8; ++i) {
         urEventRelease(events.front());
         events.pop_front();
     }
     for (int i = 0; i < 8; ++i) {
-        auto e = pool->allocate(reinterpret_cast<ur_queue_handle_t>(0x1),
-                                UR_COMMAND_KERNEL_LAUNCH);
+        auto e = pool->allocate();
+        e->resetQueueAndCommand(queue, UR_COMMAND_KERNEL_LAUNCH);
         events.push_back(e);
     }

@@ -223,3 +234,71 @@
         urEventRelease(e);
     }
 }
+
+using EventPoolTestWithQueue = uur::urQueueTestWithParam;
+
+UUR_TEST_SUITE_P(EventPoolTestWithQueue, testing::ValuesIn(test_cases),
+                 printParams);
+
+// TODO: actual min version is unknown, retest after drivers on CI are
+// updated.
+std::tuple minL0DriverVersion = {1, 6, 31294};
+
+TEST_P(EventPoolTestWithQueue, WithTimestamp) {
+    // Skip due to driver bug causing a sigbus
+    SKIP_IF_DRIVER_TOO_OLD("Level-Zero", minL0DriverVersion, platform, device);
+
+    if (!(getParam().flags & EVENT_FLAGS_PROFILING_ENABLED)) {
+        GTEST_SKIP() << "Profiling needs to be enabled";
+    }
+
+    auto zeEvent = createZeEvent(context, device);
+
+    ur_event_handle_t hEvent;
+    ASSERT_SUCCESS(urEventCreateWithNativeHandle(
+        reinterpret_cast<ur_native_handle_t>(zeEvent.get()), context, nullptr,
+        &hEvent));
+
+    ur_device_handle_t hDevice;
+    ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_DEVICE, sizeof(device),
+                                  &hDevice, nullptr));
+
+    ur_event_handle_t first;
+    ze_event_handle_t zeFirst;
+    {
+        ASSERT_SUCCESS(
+            urEnqueueTimestampRecordingExp(queue, false, 1, &hEvent, &first));
+        zeFirst = first->getZeEvent();
+
+        urEventRelease(
+            first); // should not actually release the event until recording is completed
+    }
+    ur_event_handle_t second;
+    ze_event_handle_t zeSecond;
+    {
+        ASSERT_SUCCESS(
+            urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &second));
+        zeSecond = second->getZeEvent();
+        ASSERT_SUCCESS(urEventRelease(second));
+    }
+    ASSERT_NE(first, second);
+    ASSERT_NE(zeFirst, zeSecond);
+
+    ASSERT_EQ(zeEventHostSignal(zeEvent.get()), ZE_RESULT_SUCCESS);
+
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Now, the first event should be available for reuse
+    ur_event_handle_t third;
+    ze_event_handle_t zeThird;
+    {
+        ASSERT_SUCCESS(
+            urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &third));
+        zeThird = third->getZeEvent();
+        ASSERT_SUCCESS(urEventRelease(third));
+
+        ASSERT_FALSE(third->isTimestamped());
+    }
+    ASSERT_EQ(first, third);
+    ASSERT_EQ(zeFirst, zeThird);
+}
diff --git a/test/adapters/level_zero/ze_helpers.hpp b/test/adapters/level_zero/ze_helpers.hpp
new file mode 100644
index 0000000000..3943782283
--- /dev/null
+++ b/test/adapters/level_zero/ze_helpers.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ze_api.h"
+#include
+#include
+#include
+#include
+
+std::unique_ptr<_ze_event_handle_t, std::function<void(ze_event_handle_t)>>
+createZeEvent(ur_context_handle_t hContext, ur_device_handle_t hDevice) {
+    ze_event_pool_desc_t desc;
+    desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
+    desc.pNext = nullptr;
+    desc.count = 1;
+    desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+
+    ur_native_handle_t nativeContext;
+    EXPECT_SUCCESS(urContextGetNativeHandle(hContext, &nativeContext));
+
+    ur_native_handle_t nativeDevice;
+    EXPECT_SUCCESS(urDeviceGetNativeHandle(hDevice, &nativeDevice));
+
+    ze_event_pool_handle_t pool = nullptr;
+
+    EXPECT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1,
+                                (ze_device_handle_t *)&nativeDevice, &pool),
+              ZE_RESULT_SUCCESS);
+
+    ze_event_desc_t eventDesc;
+    eventDesc.pNext = nullptr;
+    eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
+    eventDesc.index = 0;
+    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    eventDesc.wait = 0;
+
+    ze_event_handle_t zeEvent;
+    EXPECT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS);
+
+    return std::unique_ptr<_ze_event_handle_t,
+                           std::function<void(ze_event_handle_t)>>(
+        zeEvent, [pool](ze_event_handle_t zeEvent) {
+            zeEventDestroy(zeEvent);
+            zeEventPoolDestroy(pool);
+        });
+}
\ No newline at end of file

From 8c769174ea18c45b9a8729002c9fbb176153411e Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz
Date: Thu, 21 Nov 2024 01:13:59 +0000
Subject: [PATCH 108/148] [CTS] fix
 urEnqueueEventsWaitMultiDeviceTest.EnqueueWaitOnADifferentQueue

Wait should be done on a different queue as the name suggests.
---
 test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp b/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp
index 1e281b0632..80da446814 100644
--- a/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp
+++ b/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp
@@ -92,7 +92,7 @@ TEST_F(urEnqueueEventsWaitMultiDeviceTest, EnqueueWaitOnADifferentQueue) {
     uur::raii::Event event;
     ASSERT_SUCCESS(urEnqueueUSMMemcpy(queues[0], false, ptrs[1], ptrs[0], size,
                                       0, nullptr, event.ptr()));
-    ASSERT_SUCCESS(urEnqueueEventsWait(queues[0], 1, event.ptr(), nullptr));
+    ASSERT_SUCCESS(urEnqueueEventsWait(queues[1], 1, event.ptr(), nullptr));

     ASSERT_SUCCESS(urQueueFinish(queues[0]));
     verifyData(ptrs[1], pattern);

From fbd879482c8bce659e98e1448a2c134e3bcc8b18 Mon Sep 17 00:00:00 2001
From: "Zhang, Winston"
Date: Thu, 12 Dec 2024 00:29:32 +0000
Subject: [PATCH 109/148] [L0] Fixed event leak when outevent is given and is
 completed

Signed-off-by: Zhang, Winston
---
 source/adapters/level_zero/event.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index c1e93483b8..a0b3dcd328 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -145,6 +145,10 @@ ur_result_t urEnqueueEventsWait(
       std::unique_lock Lock(Queue->Mutex);
       resetCommandLists(Queue);
     }
+    if (OutEvent && (*OutEvent)->Completed) {
+      UR_CALL(CleanupCompletedEvent((*OutEvent), false, false));
+      UR_CALL(urEventReleaseInternal((*OutEvent)));
+    }

     return UR_RESULT_SUCCESS;
   }
@@ -955,7 +959,6 @@ ur_result_t urEventCreateWithNativeHandle(
       UREvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */,
                                        Context, UR_EXT_COMMAND_TYPE_USER,
Properties->isNativeHandleOwned); - UREvent->RefCountExternal++; } catch (const std::bad_alloc &) { @@ -1111,6 +1114,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { // enabled or not, so we access properties of the queue and that's why queue // must released later. if (DisableEventsCaching || !Event->OwnNativeHandle) { + ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); delete Event; } else { Event->Context->addEventToContextCache(Event); From 800b452d67c5d8ba8c398bd02bc5f9e0eb0dc774 Mon Sep 17 00:00:00 2001 From: Martin Morrison-Grant Date: Wed, 30 Oct 2024 16:54:42 +0000 Subject: [PATCH 110/148] Improvements to align CTS and Spec for Virtual Memory: - UR_RESULT_ERROR_INVALID_ENUMERATION test for urVirtualMemSetAccess - Add test for urVirtualMemMap with different access flags - Add test for urVirtualMemSetAccess with different access flags - InvalidEnumeration test for urPhysicalMemCreate and changed some tests to only run once - different size values were not needed for these - Add test for urPhysicalMemCreate with different flags (only one placeholder is available, this is just a fixture for future additions) - Add new urPhysicalMemGetInfo entry point along with a reference count enum. Added adapter implementations for this - Add a test for urPhysicalMemGetInfo and modified urPhysicalMemRelease/Retain to use GetInfo to verify reference count is updated accordingly --- include/ur_api.h | 59 ++++++- include/ur_api_funcs.def | 1 + include/ur_ddi.h | 10 ++ include/ur_print.h | 16 ++ include/ur_print.hpp | 151 ++++++++++++++++++ scripts/core/memory.yml | 2 +- scripts/core/registry.yml | 3 + scripts/core/virtual_memory.yml | 49 ++++++ source/adapters/cuda/physical_mem.cpp | 32 +++- source/adapters/cuda/physical_mem.hpp | 14 +- source/adapters/cuda/ur_interface_loader.cpp | 1 + source/adapters/hip/physical_mem.cpp | 6 + source/adapters/hip/ur_interface_loader.cpp | 1 + source/adapters/level_zero/physical_mem.cpp | 17 ++ .../level_zero/ur_interface_loader.cpp | 1 + .../level_zero/ur_interface_loader.hpp | 4 + source/adapters/level_zero/v2/api.cpp | 7 + source/adapters/mock/ur_mockddi.cpp | 56 +++++++ source/adapters/native_cpu/physical_mem.cpp | 6 + .../native_cpu/ur_interface_loader.cpp | 1 + source/adapters/opencl/physical_mem.cpp | 6 + .../adapters/opencl/ur_interface_loader.cpp | 1 + source/loader/layers/tracing/ur_trcddi.cpp | 51 ++++++ source/loader/layers/validation/ur_valddi.cpp | 46 ++++++ source/loader/loader.def.in | 3 + source/loader/loader.map.in | 3 + source/loader/ur_ldrddi.cpp | 85 ++++++++++ source/loader/ur_libapi.cpp | 37 +++++ source/loader/ur_print.cpp | 16 ++ source/ur_api.cpp | 30 ++++ .../testing/include/uur/fixtures.h | 15 +- .../conformance/virtual_memory/CMakeLists.txt | 1 + .../virtual_memory/urPhysicalMemCreate.cpp | 56 +++++-- .../virtual_memory/urPhysicalMemGetInfo.cpp | 93 +++++++++++ .../virtual_memory/urPhysicalMemRelease.cpp | 15 ++ .../virtual_memory/urPhysicalMemRetain.cpp | 15 ++ .../virtual_memory/urVirtualMemMap.cpp | 16 +- .../virtual_memory/urVirtualMemSetAccess.cpp | 31 +++- .../virtual_memory_adapter_level_zero.match | 11 +- ...virtual_memory_adapter_level_zero_v2.match | 16 +- 40 files changed, 941 insertions(+), 43 deletions(-) create mode 100644 test/conformance/virtual_memory/urPhysicalMemGetInfo.cpp diff --git a/include/ur_api.h b/include/ur_api.h index 7922b53d6c..36e6c29e68 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -233,6 +233,7 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT = 
246, ///< Enumerator for ::urEnqueueEventsWaitWithBarrierExt UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 247, ///< Enumerator for ::urTensorMapEncodeIm2ColExp UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 248, ///< Enumerator for ::urTensorMapEncodeTiledExp + UR_FUNCTION_PHYSICAL_MEM_GET_INFO = 249, ///< Enumerator for ::urPhysicalMemGetInfo /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2525,7 +2526,7 @@ typedef enum ur_mem_type_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Memory Information type typedef enum ur_mem_info_t { - UR_MEM_INFO_SIZE = 0, ///< [size_t] actual size of of memory object in bytes + UR_MEM_INFO_SIZE = 0, ///< [size_t] actual size of the memory object in bytes UR_MEM_INFO_CONTEXT = 1, ///< [::ur_context_handle_t] context in which the memory object was created UR_MEM_INFO_REFERENCE_COUNT = 2, ///< [uint32_t] Reference count of the memory object. ///< The reference count returned should be considered immediately stale. @@ -4138,6 +4139,50 @@ urPhysicalMemRelease( ur_physical_mem_handle_t hPhysicalMem ///< [in][release] handle of the physical memory object to release. ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Physical memory range info queries. +typedef enum ur_physical_mem_info_t { + UR_PHYSICAL_MEM_INFO_CONTEXT = 0, ///< [::ur_context_handle_t] context in which the physical memory object + ///< was created. + UR_PHYSICAL_MEM_INFO_DEVICE = 1, ///< [::ur_device_handle_t] device associated with this physical memory + ///< object. + UR_PHYSICAL_MEM_INFO_SIZE = 2, ///< [size_t] actual size of the physical memory object in bytes. + UR_PHYSICAL_MEM_INFO_PROPERTIES = 3, ///< [::ur_physical_mem_properties_t] properties set when creating this + ///< physical memory object. + UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT = 4, ///< [uint32_t] Reference count of the physical memory object. + ///< The reference count returned should be considered immediately stale. + ///< It is unsuitable for general use in applications. This feature is + ///< provided for identifying memory leaks. + /// @cond + UR_PHYSICAL_MEM_INFO_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_physical_mem_info_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get information about a physical memory object. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hPhysicalMem` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT < propName` +UR_APIEXPORT ur_result_t UR_APICALL +urPhysicalMemGetInfo( + ur_physical_mem_handle_t hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void *pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t *pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." 
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -11317,6 +11362,18 @@ typedef struct ur_physical_mem_release_params_t { ur_physical_mem_handle_t *phPhysicalMem; } ur_physical_mem_release_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urPhysicalMemGetInfo +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_physical_mem_get_info_params_t { + ur_physical_mem_handle_t *phPhysicalMem; + ur_physical_mem_info_t *ppropName; + size_t *ppropSize; + void **ppPropValue; + size_t **ppPropSizeRet; +} ur_physical_mem_get_info_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urAdapterGet /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def index 5279534547..96d8e0caf4 100644 --- a/include/ur_api_funcs.def +++ b/include/ur_api_funcs.def @@ -96,6 +96,7 @@ _UR_API(urMemImageGetInfo) _UR_API(urPhysicalMemCreate) _UR_API(urPhysicalMemRetain) _UR_API(urPhysicalMemRelease) +_UR_API(urPhysicalMemGetInfo) _UR_API(urAdapterGet) _UR_API(urAdapterRelease) _UR_API(urAdapterRetain) diff --git a/include/ur_ddi.h b/include/ur_ddi.h index ce7dd137a9..eeb323fc58 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -978,12 +978,22 @@ typedef ur_result_t(UR_APICALL *ur_pfnPhysicalMemRetain_t)( typedef ur_result_t(UR_APICALL *ur_pfnPhysicalMemRelease_t)( ur_physical_mem_handle_t); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urPhysicalMemGetInfo +typedef ur_result_t(UR_APICALL *ur_pfnPhysicalMemGetInfo_t)( + ur_physical_mem_handle_t, + ur_physical_mem_info_t, + size_t, + void *, + size_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of PhysicalMem functions pointers typedef struct ur_physical_mem_dditable_t { ur_pfnPhysicalMemCreate_t pfnCreate; ur_pfnPhysicalMemRetain_t pfnRetain; ur_pfnPhysicalMemRelease_t pfnRelease; + ur_pfnPhysicalMemGetInfo_t pfnGetInfo; } ur_physical_mem_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index 3782ffb5ce..9bec0bc7b5 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -594,6 +594,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemFlags(enum ur_physical_mem /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemProperties(const struct ur_physical_mem_properties_t params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_physical_mem_info_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemInfo(enum ur_physical_mem_info_t value, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_program_metadata_type_t enum /// @returns @@ -1850,6 +1858,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemRetainParams(const struct /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemReleaseParams(const struct 
ur_physical_mem_release_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_physical_mem_get_info_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintPhysicalMemGetInfoParams(const struct ur_physical_mem_get_info_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_adapter_get_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 8284731dc1..5255a20f78 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -155,6 +155,9 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_virtual_mem template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_physical_mem_info_t value, size_t size); + inline ur_result_t printUnion( std::ostream &os, const union ur_program_metadata_value_t params, @@ -313,6 +316,7 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_virtual_mem_access_fla inline std::ostream &operator<<(std::ostream &os, enum ur_virtual_mem_info_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_physical_mem_flag_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_physical_mem_properties_t params); +inline std::ostream &operator<<(std::ostream &os, enum ur_physical_mem_info_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_program_metadata_type_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_program_metadata_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_program_properties_t params); @@ -994,6 +998,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP: os << "UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP"; break; + case UR_FUNCTION_PHYSICAL_MEM_GET_INFO: + os << "UR_FUNCTION_PHYSICAL_MEM_GET_INFO"; + break; default: os << "unknown enumerator"; break; @@ -7498,6 +7505,113 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_physical_mem_p os << "}"; return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_physical_mem_info_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_physical_mem_info_t value) { + switch (value) { + case UR_PHYSICAL_MEM_INFO_CONTEXT: + os << "UR_PHYSICAL_MEM_INFO_CONTEXT"; + break; + case UR_PHYSICAL_MEM_INFO_DEVICE: + os << "UR_PHYSICAL_MEM_INFO_DEVICE"; + break; + case UR_PHYSICAL_MEM_INFO_SIZE: + os << "UR_PHYSICAL_MEM_INFO_SIZE"; + break; + case UR_PHYSICAL_MEM_INFO_PROPERTIES: + os << "UR_PHYSICAL_MEM_INFO_PROPERTIES"; + break; + case UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT: + os << "UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_physical_mem_info_t enum value +template <> +inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_physical_mem_info_t value, size_t 
size) { + if (ptr == NULL) { + return printPtr(os, ptr); + } + + switch (value) { + case UR_PHYSICAL_MEM_INFO_CONTEXT: { + const ur_context_handle_t *tptr = (const ur_context_handle_t *)ptr; + if (sizeof(ur_context_handle_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_context_handle_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + ur::details::printPtr(os, + *tptr); + + os << ")"; + } break; + case UR_PHYSICAL_MEM_INFO_DEVICE: { + const ur_device_handle_t *tptr = (const ur_device_handle_t *)ptr; + if (sizeof(ur_device_handle_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_device_handle_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + ur::details::printPtr(os, + *tptr); + + os << ")"; + } break; + case UR_PHYSICAL_MEM_INFO_SIZE: { + const size_t *tptr = (const size_t *)ptr; + if (sizeof(size_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_PHYSICAL_MEM_INFO_PROPERTIES: { + const ur_physical_mem_properties_t *tptr = (const ur_physical_mem_properties_t *)ptr; + if (sizeof(ur_physical_mem_properties_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_physical_mem_properties_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT: { + const uint32_t *tptr = (const uint32_t *)ptr; + if (sizeof(uint32_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(uint32_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + default: + os << "unknown enumerator"; + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_program_metadata_type_t type /// @returns @@ -13805,6 +13919,40 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_physical_mem_get_info_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_physical_mem_get_info_params_t *params) { + + os << ".hPhysicalMem = "; + + ur::details::printPtr(os, + *(params->phPhysicalMem)); + + os << ", "; + os << ".propName = "; + + os << *(params->ppropName); + + os << ", "; + os << ".propSize = "; + + os << *(params->ppropSize); + + os << ", "; + os << ".pPropValue = "; + ur::details::printTagged(os, *(params->ppPropValue), *(params->ppropName), *(params->ppropSize)); + + os << ", "; + os << ".pPropSizeRet = "; + + ur::details::printPtr(os, + *(params->ppPropSizeRet)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_adapter_get_params_t type /// @returns @@ -19652,6 +19800,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_PHYSICAL_MEM_RELEASE: { os << (const struct ur_physical_mem_release_params_t *)params; } break; + 
case UR_FUNCTION_PHYSICAL_MEM_GET_INFO: { + os << (const struct ur_physical_mem_get_info_params_t *)params; + } break; case UR_FUNCTION_ADAPTER_GET: { os << (const struct ur_adapter_get_params_t *)params; } break; diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index 7cc7467da4..467e10d749 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -59,7 +59,7 @@ name: $x_mem_info_t typed_etors: True etors: - name: SIZE - desc: "[size_t] actual size of of memory object in bytes" + desc: "[size_t] actual size of the memory object in bytes" - name: CONTEXT desc: "[$x_context_handle_t] context in which the memory object was created" - name: REFERENCE_COUNT diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index f1a5d9199f..2ae34fb0b4 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -613,6 +613,9 @@ etors: - name: TENSOR_MAP_ENCODE_TILED_EXP desc: Enumerator for $xTensorMapEncodeTiledExp value: '248' +- name: PHYSICAL_MEM_GET_INFO + desc: Enumerator for $xPhysicalMemGetInfo + value: '249' --- type: enum desc: Defines structure types diff --git a/scripts/core/virtual_memory.yml b/scripts/core/virtual_memory.yml index 133266de64..a6b580a5a9 100644 --- a/scripts/core/virtual_memory.yml +++ b/scripts/core/virtual_memory.yml @@ -303,3 +303,52 @@ params: - type: $x_physical_mem_handle_t name: hPhysicalMem desc: "[in][release] handle of the physical memory object to release." + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Physical memory range info queries." +class: $xPhysicalMem +name: $x_physical_mem_info_t +typed_etors: True +etors: + - name: CONTEXT + desc: "[$x_context_handle_t] context in which the physical memory object was created." + - name: DEVICE + desc: "[$x_device_handle_t] device associated with this physical memory object." + - name: SIZE + desc: "[size_t] actual size of the physical memory object in bytes." + - name: PROPERTIES + desc: "[$x_physical_mem_properties_t] properties set when creating this physical memory object." + - name: REFERENCE_COUNT + desc: | + [uint32_t] Reference count of the physical memory object. + The reference count returned should be considered immediately stale. + It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. + +--- #-------------------------------------------------------------------------- +type: function +desc: "Get information about a physical memory object." +class: $xPhysicalMem +name: GetInfo +params: + - type: $x_physical_mem_handle_t + name: hPhysicalMem + desc: "[in] handle of the physical memory object to query." + - type: $x_physical_mem_info_t + name: propName + desc: "[in] type of the info to query." + - type: size_t + name: propSize + desc: "[in] size in bytes of the memory pointed to by pPropValue." + - type: void* + name: pPropValue + desc: > + [out][optional][typename(propName, propSize)] array of bytes holding + the info. If propSize is less than the real number of bytes needed to + return the info then the $X_RESULT_ERROR_INVALID_SIZE error is + returned and pPropValue is not used. + - type: size_t* + name: pPropSizeRet + desc: > + [out][optional] pointer to the actual size in bytes of the queried + propName." 
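
As a usage sketch for the new query (hypothetical caller code, not part of the patch; it follows the propSize / pPropSizeRet contract specified above):

#include <ur_api.h>

// Read the reference count of a physical memory object via the new query.
ur_result_t readPhysicalMemRefCount(ur_physical_mem_handle_t hPhysicalMem,
                                    uint32_t *refCount) {
    size_t size = 0;
    // First call: pass a null pPropValue to learn the required size.
    ur_result_t res = urPhysicalMemGetInfo(
        hPhysicalMem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, 0, nullptr, &size);
    if (res != UR_RESULT_SUCCESS)
        return res;
    // size is sizeof(uint32_t) for this propName.
    return urPhysicalMemGetInfo(hPhysicalMem,
                                UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT,
                                sizeof(*refCount), refCount, nullptr);
}
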
diff --git a/source/adapters/cuda/physical_mem.cpp b/source/adapters/cuda/physical_mem.cpp index 28b9312176..71bf596acb 100644 --- a/source/adapters/cuda/physical_mem.cpp +++ b/source/adapters/cuda/physical_mem.cpp @@ -33,8 +33,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( UR_CHECK_ERROR(Result); } try { - *phPhysicalMem = - new ur_physical_mem_handle_t_(ResHandle, hContext, hDevice); + *phPhysicalMem = new ur_physical_mem_handle_t_( + ResHandle, hContext, hDevice, size, + pProperties ? *pProperties : ur_physical_mem_properties_t{}); } catch (std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -66,3 +67,30 @@ urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { } return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t hPhysicalMem, ur_physical_mem_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PHYSICAL_MEM_INFO_CONTEXT: { + return ReturnValue(hPhysicalMem->getContext()); + } + case UR_PHYSICAL_MEM_INFO_DEVICE: { + return ReturnValue(hPhysicalMem->getDevice()); + } + case UR_PHYSICAL_MEM_INFO_SIZE: { + return ReturnValue(hPhysicalMem->getSize()); + } + case UR_PHYSICAL_MEM_INFO_PROPERTIES: { + return ReturnValue(hPhysicalMem->getProperties()); + } + case UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT: { + return ReturnValue(hPhysicalMem->getReferenceCount()); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } +} diff --git a/source/adapters/cuda/physical_mem.hpp b/source/adapters/cuda/physical_mem.hpp index c77ad0d547..7e38e1be9e 100644 --- a/source/adapters/cuda/physical_mem.hpp +++ b/source/adapters/cuda/physical_mem.hpp @@ -27,10 +27,14 @@ struct ur_physical_mem_handle_t_ { native_type PhysicalMem; ur_context_handle_t_ *Context; ur_device_handle_t Device; + size_t Size; + ur_physical_mem_properties_t Properties; ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx, - ur_device_handle_t Device) - : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device) { + ur_device_handle_t Device, size_t Size, + ur_physical_mem_properties_t Properties) + : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device), + Size(Size), Properties(Properties) { urContextRetain(Context); urDeviceRetain(Device); } @@ -51,4 +55,10 @@ struct ur_physical_mem_handle_t_ { uint32_t decrementReferenceCount() noexcept { return --RefCount; } uint32_t getReferenceCount() const noexcept { return RefCount; } + + size_t getSize() const noexcept { return Size; } + + ur_physical_mem_properties_t getProperties() const noexcept { + return Properties; + } }; diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index cea4707a05..ad0d775be0 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -401,6 +401,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = urPhysicalMemCreate; pDdiTable->pfnRelease = urPhysicalMemRelease; pDdiTable->pfnRetain = urPhysicalMemRetain; + pDdiTable->pfnGetInfo = urPhysicalMemGetInfo; return retVal; } diff --git a/source/adapters/hip/physical_mem.cpp b/source/adapters/hip/physical_mem.cpp index f0003b6c00..87fe716d48 100644 --- a/source/adapters/hip/physical_mem.cpp +++ b/source/adapters/hip/physical_mem.cpp @@ -28,3 +28,9 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urPhysicalMemRelease(ur_physical_mem_handle_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL +urPhysicalMemGetInfo(ur_physical_mem_handle_t, ur_physical_mem_info_t, size_t, + void *, size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 2c9df55bb6..42ca36eb77 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -368,6 +368,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = urPhysicalMemCreate; pDdiTable->pfnRelease = urPhysicalMemRelease; pDdiTable->pfnRetain = urPhysicalMemRetain; + pDdiTable->pfnGetInfo = urPhysicalMemGetInfo; return retVal; } diff --git a/source/adapters/level_zero/physical_mem.cpp b/source/adapters/level_zero/physical_mem.cpp index e7bb498859..e28b876905 100644 --- a/source/adapters/level_zero/physical_mem.cpp +++ b/source/adapters/level_zero/physical_mem.cpp @@ -52,4 +52,21 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t hPhysicalMem, ur_physical_mem_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT: { + return ReturnValue(hPhysicalMem->RefCount.load()); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return UR_RESULT_SUCCESS; +} + } // namespace ur::level_zero diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 1d9c8d5c37..1419c8a606 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -320,6 +320,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = ur::level_zero::urPhysicalMemCreate; pDdiTable->pfnRetain = ur::level_zero::urPhysicalMemRetain; pDdiTable->pfnRelease = ur::level_zero::urPhysicalMemRelease; + pDdiTable->pfnGetInfo = ur::level_zero::urPhysicalMemGetInfo; return result; } diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index ed7e97fa3b..5bd7c904f1 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -182,6 +182,10 @@ ur_result_t urPhysicalMemCreate(ur_context_handle_t hContext, ur_physical_mem_handle_t *phPhysicalMem); ur_result_t urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem); ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem); +ur_result_t urPhysicalMemGetInfo(ur_physical_mem_handle_t hPhysicalMem, + ur_physical_mem_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); ur_result_t urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 593115a99f..6a9264dcf2 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -156,6 +156,13 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL 
urPhysicalMemGetInfo( + ur_physical_mem_handle_t hPhysicalMem, ur_physical_mem_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + ur_result_t urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_sampler_properties_t *pProperties, diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index baacaacb19..b60be1d561 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -3103,6 +3103,60 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urPhysicalMemGetInfo +__urdlllocal ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_physical_mem_get_info_params_t params = { + &hPhysicalMem, &propName, &propSize, &pPropValue, &pPropSizeRet}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urPhysicalMemGetInfo")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback("urPhysicalMemGetInfo")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urPhysicalMemGetInfo")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramCreateWithIL __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( @@ -11509,6 +11563,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnRelease = driver::urPhysicalMemRelease; + pDdiTable->pfnGetInfo = driver::urPhysicalMemGetInfo; + return result; } catch (...) 
{ return exceptionToResult(std::current_exception()); diff --git a/source/adapters/native_cpu/physical_mem.cpp b/source/adapters/native_cpu/physical_mem.cpp index 7c535bfcca..a5a1c6411c 100644 --- a/source/adapters/native_cpu/physical_mem.cpp +++ b/source/adapters/native_cpu/physical_mem.cpp @@ -27,3 +27,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemRelease(ur_physical_mem_handle_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL +urPhysicalMemGetInfo(ur_physical_mem_handle_t, ur_physical_mem_info_t, size_t, + void *, size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 55b1e6a568..a913aa4c4e 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -357,6 +357,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = urPhysicalMemCreate; pDdiTable->pfnRelease = urPhysicalMemRelease; pDdiTable->pfnRetain = urPhysicalMemRetain; + pDdiTable->pfnGetInfo = urPhysicalMemGetInfo; return retVal; } diff --git a/source/adapters/opencl/physical_mem.cpp b/source/adapters/opencl/physical_mem.cpp index 9fffd0f979..791804f0ac 100644 --- a/source/adapters/opencl/physical_mem.cpp +++ b/source/adapters/opencl/physical_mem.cpp @@ -27,3 +27,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemRelease(ur_physical_mem_handle_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL +urPhysicalMemGetInfo(ur_physical_mem_handle_t, ur_physical_mem_info_t, size_t, + void *, size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index d51c27f6cc..7a3845a4d9 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -394,6 +394,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = urPhysicalMemCreate; pDdiTable->pfnRelease = urPhysicalMemRelease; pDdiTable->pfnRetain = urPhysicalMemRetain; + pDdiTable->pfnGetInfo = urPhysicalMemGetInfo; return retVal; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index bd620a159f..55f8d00bea 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -2603,6 +2603,54 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urPhysicalMemGetInfo +__urdlllocal ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." 
+) { + auto pfnGetInfo = getContext()->urDdiTable.PhysicalMem.pfnGetInfo; + + if (nullptr == pfnGetInfo) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_physical_mem_get_info_params_t params = { + &hPhysicalMem, &propName, &propSize, &pPropValue, &pPropSizeRet}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_PHYSICAL_MEM_GET_INFO, "urPhysicalMemGetInfo", ¶ms); + + auto &logger = getContext()->logger; + logger.info(" ---> urPhysicalMemGetInfo\n"); + + ur_result_t result = + pfnGetInfo(hPhysicalMem, propName, propSize, pPropValue, pPropSizeRet); + + getContext()->notify_end(UR_FUNCTION_PHYSICAL_MEM_GET_INFO, + "urPhysicalMemGetInfo", ¶ms, &result, + instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PHYSICAL_MEM_GET_INFO, ¶ms); + logger.info(" <--- urPhysicalMemGetInfo({}) -> {};\n", args_str.str(), + result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramCreateWithIL __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( @@ -10166,6 +10214,9 @@ __urdlllocal ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( dditable.pfnRelease = pDdiTable->pfnRelease; pDdiTable->pfnRelease = ur_tracing_layer::urPhysicalMemRelease; + dditable.pfnGetInfo = pDdiTable->pfnGetInfo; + pDdiTable->pfnGetInfo = ur_tracing_layer::urPhysicalMemGetInfo; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b29618ae7a..6e48f79edc 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -2665,6 +2665,49 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urPhysicalMemGetInfo +__urdlllocal ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." 
+) { + auto pfnGetInfo = getContext()->urDdiTable.PhysicalMem.pfnGetInfo; + + if (nullptr == pfnGetInfo) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hPhysicalMem) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT < propName) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hPhysicalMem)) { + getContext()->refCountContext->logInvalidReference(hPhysicalMem); + } + + ur_result_t result = + pfnGetInfo(hPhysicalMem, propName, propSize, pPropValue, pPropSizeRet); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramCreateWithIL __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( @@ -11300,6 +11343,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( dditable.pfnRelease = pDdiTable->pfnRelease; pDdiTable->pfnRelease = ur_validation_layer::urPhysicalMemRelease; + dditable.pfnGetInfo = pDdiTable->pfnGetInfo; + pDdiTable->pfnGetInfo = ur_validation_layer::urPhysicalMemGetInfo; + return result; } diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 5ca6d99113..a3a18a4170 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -161,6 +161,7 @@ EXPORTS urMemRelease urMemRetain urPhysicalMemCreate + urPhysicalMemGetInfo urPhysicalMemRelease urPhysicalMemRetain urPlatformCreateWithNativeHandle @@ -403,6 +404,8 @@ EXPORTS urPrintMemoryScopeCapabilityFlags urPrintPhysicalMemCreateParams urPrintPhysicalMemFlags + urPrintPhysicalMemGetInfoParams + urPrintPhysicalMemInfo urPrintPhysicalMemProperties urPrintPhysicalMemReleaseParams urPrintPhysicalMemRetainParams diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 706d28dd01..00a6de8c10 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -161,6 +161,7 @@ urMemRelease; urMemRetain; urPhysicalMemCreate; + urPhysicalMemGetInfo; urPhysicalMemRelease; urPhysicalMemRetain; urPlatformCreateWithNativeHandle; @@ -403,6 +404,8 @@ urPrintMemoryScopeCapabilityFlags; urPrintPhysicalMemCreateParams; urPrintPhysicalMemFlags; + urPrintPhysicalMemGetInfoParams; + urPrintPhysicalMemInfo; urPrintPhysicalMemProperties; urPrintPhysicalMemReleaseParams; urPrintPhysicalMemRetainParams; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index c482fbdfcc..c74b9d6caf 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -2515,6 +2515,90 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urPhysicalMemGetInfo +__urdlllocal ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. 
+ size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." +) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = + reinterpret_cast(hPhysicalMem)->dditable; + auto pfnGetInfo = dditable->ur.PhysicalMem.pfnGetInfo; + if (nullptr == pfnGetInfo) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hPhysicalMem = + reinterpret_cast(hPhysicalMem)->handle; + + // this value is needed for converting adapter handles to loader handles + size_t sizeret = 0; + if (pPropSizeRet == NULL) { + pPropSizeRet = &sizeret; + } + + // forward to device-platform + result = + pfnGetInfo(hPhysicalMem, propName, propSize, pPropValue, pPropSizeRet); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + if (pPropValue != nullptr) { + switch (propName) { + case UR_PHYSICAL_MEM_INFO_CONTEXT: { + ur_context_handle_t *handles = + reinterpret_cast(pPropValue); + size_t nelements = *pPropSizeRet / sizeof(ur_context_handle_t); + for (size_t i = 0; i < nelements; ++i) { + if (handles[i] != nullptr) { + handles[i] = reinterpret_cast( + context->factories.ur_context_factory.getInstance( + handles[i], dditable)); + } + } + } break; + case UR_PHYSICAL_MEM_INFO_DEVICE: { + ur_device_handle_t *handles = + reinterpret_cast(pPropValue); + size_t nelements = *pPropSizeRet / sizeof(ur_device_handle_t); + for (size_t i = 0; i < nelements; ++i) { + if (handles[i] != nullptr) { + handles[i] = reinterpret_cast( + context->factories.ur_device_factory.getInstance( + handles[i], dditable)); + } + } + } break; + default: { + } break; + } + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramCreateWithIL __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( @@ -10354,6 +10438,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( pDdiTable->pfnCreate = ur_loader::urPhysicalMemCreate; pDdiTable->pfnRetain = ur_loader::urPhysicalMemRetain; pDdiTable->pfnRelease = ur_loader::urPhysicalMemRelease; + pDdiTable->pfnGetInfo = ur_loader::urPhysicalMemGetInfo; } else { // return pointers directly to platform's DDIs *pDdiTable = ur_loader::getContext() diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 15c78956d8..bc7cfb7eec 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -2961,6 +2961,43 @@ ur_result_t UR_APICALL urPhysicalMemRelease( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get information about a physical memory object. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hPhysicalMem` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT < propName` +ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. 
+ void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." + ) try { + auto pfnGetInfo = ur_lib::getContext()->urDdiTable.PhysicalMem.pfnGetInfo; + if (nullptr == pfnGetInfo) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnGetInfo(hPhysicalMem, propName, propSize, pPropValue, + pPropSizeRet); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Create a program object from input intermediate language. /// diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 690f562af4..02fd10f5f8 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -604,6 +604,14 @@ urPrintPhysicalMemProperties(const struct ur_physical_mem_properties_t params, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintPhysicalMemInfo(enum ur_physical_mem_info_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintProgramMetadataType(enum ur_program_metadata_type_t value, char *buffer, const size_t buff_size, size_t *out_size) { @@ -2235,6 +2243,14 @@ ur_result_t urPrintPhysicalMemReleaseParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintPhysicalMemGetInfoParams( + const struct ur_physical_mem_get_info_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintPlatformGetParams(const struct ur_platform_get_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 3925d5d160..444170c71d 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -2535,6 +2535,36 @@ ur_result_t UR_APICALL urPhysicalMemRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get information about a physical memory object. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hPhysicalMem` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT < propName` +ur_result_t UR_APICALL urPhysicalMemGetInfo( + ur_physical_mem_handle_t + hPhysicalMem, ///< [in] handle of the physical memory object to query. + ur_physical_mem_info_t propName, ///< [in] type of the info to query. + size_t + propSize, ///< [in] size in bytes of the memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. If propSize is less than the real number of bytes needed to + ///< return the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName." 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Create a program object from input intermediate language. /// diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index b1c90883d8..219434fb62 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -964,13 +964,9 @@ struct urPhysicalMemTest : urVirtualMemGranularityTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urVirtualMemGranularityTest::SetUp()); size = granularity * 256; - ur_physical_mem_properties_t props{ - UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, - nullptr, - 0 /*flags*/, - }; - ASSERT_SUCCESS( - urPhysicalMemCreate(context, device, size, &props, &physical_mem)); + + ASSERT_SUCCESS(urPhysicalMemCreate(context, device, size, &properties, + &physical_mem)); ASSERT_NE(physical_mem, nullptr); } @@ -983,6 +979,11 @@ struct urPhysicalMemTest : urVirtualMemGranularityTest { size_t size = 0; ur_physical_mem_handle_t physical_mem = nullptr; + ur_physical_mem_properties_t properties{ + UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, + nullptr, + 0 /*flags*/, + }; }; template diff --git a/test/conformance/virtual_memory/CMakeLists.txt b/test/conformance/virtual_memory/CMakeLists.txt index 6ae27f443b..d05db83bd5 100644 --- a/test/conformance/virtual_memory/CMakeLists.txt +++ b/test/conformance/virtual_memory/CMakeLists.txt @@ -7,6 +7,7 @@ add_conformance_test_with_devices_environment(virtual_memory urPhysicalMemCreate.cpp urPhysicalMemRelease.cpp urPhysicalMemRetain.cpp + urPhysicalMemGetInfo.cpp urVirtualMemFree.cpp urVirtualMemGetInfo.cpp urVirtualMemGranularityGetInfo.cpp diff --git a/test/conformance/virtual_memory/urPhysicalMemCreate.cpp b/test/conformance/virtual_memory/urPhysicalMemCreate.cpp index 236450a1ab..e7867a8a72 100644 --- a/test/conformance/virtual_memory/urPhysicalMemCreate.cpp +++ b/test/conformance/virtual_memory/urPhysicalMemCreate.cpp @@ -25,15 +25,49 @@ struct urPhysicalMemCreateTest ur_physical_mem_handle_t physical_mem = nullptr; }; -UUR_TEST_SUITE_P(urPhysicalMemCreateTest, ::testing::Values(1, 2, 3, 7, 12, 44), +using urPhysicalMemCreateWithSizeParamTest = urPhysicalMemCreateTest; +UUR_TEST_SUITE_P(urPhysicalMemCreateWithSizeParamTest, + ::testing::Values(1, 2, 3, 7, 12, 44), uur::deviceTestWithParamPrinter); -TEST_P(urPhysicalMemCreateTest, Success) { +TEST_P(urPhysicalMemCreateWithSizeParamTest, Success) { ASSERT_SUCCESS( urPhysicalMemCreate(context, device, size, nullptr, &physical_mem)); ASSERT_NE(physical_mem, nullptr); } +TEST_P(urPhysicalMemCreateWithSizeParamTest, InvalidSize) { + if (granularity == 1) { + GTEST_SKIP() + << "A granularity of 1 means that any size will be accepted."; + } + size_t invalid_size = size - 1; + ASSERT_EQ_RESULT(urPhysicalMemCreate(context, device, invalid_size, nullptr, + &physical_mem), + UR_RESULT_ERROR_INVALID_SIZE); +} + +using urPhysicalMemCreateWithFlagsParamTest = + uur::urPhysicalMemTestWithParam; +UUR_TEST_SUITE_P(urPhysicalMemCreateWithFlagsParamTest, + ::testing::Values(UR_PHYSICAL_MEM_FLAG_TBD), + uur::deviceTestWithParamPrinter); + +TEST_P(urPhysicalMemCreateWithFlagsParamTest, Success) { + ur_physical_mem_properties_t properties; + properties.stype = UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES; + properties.pNext = nullptr; + properties.flags = getParam(); + + ASSERT_SUCCESS( + urPhysicalMemCreate(context, device, size, &properties, 
&physical_mem)); + ASSERT_NE(physical_mem, nullptr); +} + +using urPhysicalMemCreateTest = urPhysicalMemCreateTest; +UUR_TEST_SUITE_P(urPhysicalMemCreateTest, ::testing::Values(1), + uur::deviceTestWithParamPrinter); + TEST_P(urPhysicalMemCreateTest, InvalidNullHandleContext) { ASSERT_EQ_RESULT( urPhysicalMemCreate(nullptr, device, size, nullptr, &physical_mem), @@ -52,13 +86,13 @@ TEST_P(urPhysicalMemCreateTest, InvalidNullPointerPhysicalMem) { UR_RESULT_ERROR_INVALID_NULL_POINTER); } -TEST_P(urPhysicalMemCreateTest, InvalidSize) { - if (granularity == 1) { - GTEST_SKIP() - << "A granularity of 1 means that any size will be accepted."; - } - size_t invalid_size = size - 1; - ASSERT_EQ_RESULT(urPhysicalMemCreate(context, device, invalid_size, nullptr, - &physical_mem), - UR_RESULT_ERROR_INVALID_SIZE); +TEST_P(urPhysicalMemCreateTest, InvalidEnumeration) { + ur_physical_mem_properties_t properties; + properties.stype = UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES; + properties.pNext = nullptr; + properties.flags = UR_PHYSICAL_MEM_FLAG_FORCE_UINT32; + + ASSERT_EQ_RESULT( + urPhysicalMemCreate(context, device, size, &properties, &physical_mem), + UR_RESULT_ERROR_INVALID_ENUMERATION); } diff --git a/test/conformance/virtual_memory/urPhysicalMemGetInfo.cpp b/test/conformance/virtual_memory/urPhysicalMemGetInfo.cpp new file mode 100644 index 0000000000..ca2595d0fa --- /dev/null +++ b/test/conformance/virtual_memory/urPhysicalMemGetInfo.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include + +using urPhysicalMemGetInfoTest = uur::urPhysicalMemTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urPhysicalMemGetInfoTest); + +TEST_P(urPhysicalMemGetInfoTest, Context) { + size_t info_size = 0; + + ASSERT_SUCCESS(urPhysicalMemGetInfo( + physical_mem, UR_PHYSICAL_MEM_INFO_CONTEXT, 0, nullptr, &info_size)); + ASSERT_NE(info_size, 0); + + std::vector data(info_size); + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, + UR_PHYSICAL_MEM_INFO_CONTEXT, + data.size(), data.data(), nullptr)); + + auto returned_context = + reinterpret_cast(data.data()); + ASSERT_EQ(context, *returned_context); +} + +TEST_P(urPhysicalMemGetInfoTest, Device) { + size_t info_size = 0; + + ASSERT_SUCCESS(urPhysicalMemGetInfo( + physical_mem, UR_PHYSICAL_MEM_INFO_DEVICE, 0, nullptr, &info_size)); + ASSERT_NE(info_size, 0); + + std::vector data(info_size); + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, + UR_PHYSICAL_MEM_INFO_DEVICE, + data.size(), data.data(), nullptr)); + + auto returned_device = reinterpret_cast(data.data()); + ASSERT_EQ(device, *returned_device); +} + +TEST_P(urPhysicalMemGetInfoTest, Size) { + size_t info_size = 0; + + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_SIZE, + 0, nullptr, &info_size)); + ASSERT_NE(info_size, 0); + + std::vector data(info_size); + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_SIZE, + data.size(), data.data(), nullptr)); + + auto returned_size = reinterpret_cast(data.data()); + ASSERT_EQ(size, *returned_size); +} + +TEST_P(urPhysicalMemGetInfoTest, Properties) { + size_t info_size = 0; + + ASSERT_SUCCESS(urPhysicalMemGetInfo( + physical_mem, UR_PHYSICAL_MEM_INFO_PROPERTIES, 0, nullptr, &info_size)); + ASSERT_NE(info_size, 0); + + std::vector data(info_size); + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, + UR_PHYSICAL_MEM_INFO_PROPERTIES, + 
data.size(), data.data(), nullptr)); + + auto returned_properties = + reinterpret_cast(data.data()); + ASSERT_EQ(properties.stype, returned_properties->stype); + ASSERT_EQ(properties.pNext, returned_properties->pNext); + ASSERT_EQ(properties.flags, returned_properties->flags); +} + +TEST_P(urPhysicalMemGetInfoTest, ReferenceCount) { + size_t info_size = 0; + + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, + UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, 0, + nullptr, &info_size)); + ASSERT_NE(info_size, 0); + + std::vector data(info_size); + ASSERT_SUCCESS(urPhysicalMemGetInfo(physical_mem, + UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + data.size(), data.data(), nullptr)); + + const size_t ReferenceCount = + *reinterpret_cast(data.data()); + ASSERT_EQ(ReferenceCount, 1); +} diff --git a/test/conformance/virtual_memory/urPhysicalMemRelease.cpp b/test/conformance/virtual_memory/urPhysicalMemRelease.cpp index 834a15e50f..e7a7e3855c 100644 --- a/test/conformance/virtual_memory/urPhysicalMemRelease.cpp +++ b/test/conformance/virtual_memory/urPhysicalMemRelease.cpp @@ -8,8 +8,23 @@ using urPhysicalMemReleaseTest = uur::urPhysicalMemTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urPhysicalMemReleaseTest); TEST_P(urPhysicalMemReleaseTest, Success) { + uint32_t referenceCount = 0; + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_GE(referenceCount, 1); + ASSERT_SUCCESS(urPhysicalMemRetain(physical_mem)); + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_EQ(referenceCount, 2); + ASSERT_SUCCESS(urPhysicalMemRelease(physical_mem)); + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_EQ(referenceCount, 1); } TEST_P(urPhysicalMemReleaseTest, InvalidNullHandlePhysicalMem) { diff --git a/test/conformance/virtual_memory/urPhysicalMemRetain.cpp b/test/conformance/virtual_memory/urPhysicalMemRetain.cpp index a438e1072d..4e8883fa2c 100644 --- a/test/conformance/virtual_memory/urPhysicalMemRetain.cpp +++ b/test/conformance/virtual_memory/urPhysicalMemRetain.cpp @@ -8,8 +8,23 @@ using urPhysicalMemRetainTest = uur::urPhysicalMemTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urPhysicalMemRetainTest); TEST_P(urPhysicalMemRetainTest, Success) { + uint32_t referenceCount = 0; + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_GE(referenceCount, 1); + ASSERT_SUCCESS(urPhysicalMemRetain(physical_mem)); + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_EQ(referenceCount, 2); + ASSERT_SUCCESS(urPhysicalMemRelease(physical_mem)); + ASSERT_SUCCESS( + urPhysicalMemGetInfo(physical_mem, UR_PHYSICAL_MEM_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_EQ(referenceCount, 1); } TEST_P(urPhysicalMemRetainTest, InvalidNullHandlePhysicalMem) { diff --git a/test/conformance/virtual_memory/urVirtualMemMap.cpp b/test/conformance/virtual_memory/urVirtualMemMap.cpp index bed65e2018..f6ed600e0a 100644 --- a/test/conformance/virtual_memory/urVirtualMemMap.cpp +++ b/test/conformance/virtual_memory/urVirtualMemMap.cpp @@ -4,15 +4,23 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include 
-using urVirtualMemMapTest = uur::urVirtualMemTest; -UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urVirtualMemMapTest); +using urVirtualMemMapWithFlagsTest = + uur::urVirtualMemTestWithParam; +UUR_TEST_SUITE_P(urVirtualMemMapWithFlagsTest, + ::testing::Values(UR_VIRTUAL_MEM_ACCESS_FLAG_NONE, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY), + uur::deviceTestWithParamPrinter); -TEST_P(urVirtualMemMapTest, Success) { +TEST_P(urVirtualMemMapWithFlagsTest, Success) { ASSERT_SUCCESS(urVirtualMemMap(context, virtual_ptr, size, physical_mem, 0, - UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE)); + getParam())); EXPECT_SUCCESS(urVirtualMemUnmap(context, virtual_ptr, size)); } +using urVirtualMemMapTest = uur::urVirtualMemTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urVirtualMemMapTest); + TEST_P(urVirtualMemMapTest, InvalidNullHandleContext) { ASSERT_EQ_RESULT(urVirtualMemMap(nullptr, virtual_ptr, size, physical_mem, 0, UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE), diff --git a/test/conformance/virtual_memory/urVirtualMemSetAccess.cpp b/test/conformance/virtual_memory/urVirtualMemSetAccess.cpp index 7b06ffb6ba..2010bbd9fc 100644 --- a/test/conformance/virtual_memory/urVirtualMemSetAccess.cpp +++ b/test/conformance/virtual_memory/urVirtualMemSetAccess.cpp @@ -4,20 +4,32 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -using urVirtualMemSetAccessTest = uur::urVirtualMemMappedTest; -UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urVirtualMemSetAccessTest); +using urVirtualMemSetAccessWithFlagsTest = + uur::urVirtualMemMappedTestWithParam; +UUR_TEST_SUITE_P(urVirtualMemSetAccessWithFlagsTest, + ::testing::Values(UR_VIRTUAL_MEM_ACCESS_FLAG_NONE, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY), + uur::deviceTestWithParamPrinter); -TEST_P(urVirtualMemSetAccessTest, Success) { - ASSERT_SUCCESS(urVirtualMemSetAccess(context, virtual_ptr, size, - UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY)); +TEST_P(urVirtualMemSetAccessWithFlagsTest, Success) { + ASSERT_SUCCESS( + urVirtualMemSetAccess(context, virtual_ptr, size, getParam())); ur_virtual_mem_access_flags_t flags = 0; ASSERT_SUCCESS(urVirtualMemGetInfo(context, virtual_ptr, size, UR_VIRTUAL_MEM_INFO_ACCESS_MODE, sizeof(flags), &flags, nullptr)); - ASSERT_TRUE(flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY); + if (getParam() == UR_VIRTUAL_MEM_ACCESS_FLAG_NONE) { + ASSERT_TRUE(flags == 0 || flags == UR_VIRTUAL_MEM_ACCESS_FLAG_NONE); + } else { + ASSERT_TRUE(flags & getParam()); + } } +using urVirtualMemSetAccessTest = uur::urVirtualMemMappedTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urVirtualMemSetAccessTest); + TEST_P(urVirtualMemSetAccessTest, InvalidNullHandleContext) { ASSERT_EQ_RESULT( urVirtualMemSetAccess(nullptr, virtual_ptr, size, @@ -31,3 +43,10 @@ TEST_P(urVirtualMemSetAccessTest, InvalidNullPointerStart) { UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY), UR_RESULT_ERROR_INVALID_NULL_POINTER); } + +TEST_P(urVirtualMemSetAccessTest, InvalidEnumeration) { + ASSERT_EQ_RESULT( + urVirtualMemSetAccess(context, virtual_ptr, size, + UR_VIRTUAL_MEM_ACCESS_FLAG_FORCE_UINT32), + UR_RESULT_ERROR_INVALID_ENUMERATION); +} diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match index 627d8eaa78..633aa41f90 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match @@ -1,5 +1,6 @@ -{{OPT}}urPhysicalMemCreateTest.Success/*__3 
-{{OPT}}urPhysicalMemCreateTest.Success/*__7 -{{OPT}}urPhysicalMemCreateTest.Success/*__12 -urPhysicalMemCreateTest.Success/*__44 -urPhysicalMemCreateTest.InvalidSize/* +{{OPT}}urPhysicalMemCreateWithSizeParamTest.Success/* +urPhysicalMemCreateWithSizeParamTest.InvalidSize/* +urPhysicalMemGetInfoTest.Context/* +urPhysicalMemGetInfoTest.Device/* +urPhysicalMemGetInfoTest.Size/* +urPhysicalMemGetInfoTest.Properties/* diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match index ec7be06f7e..531773a246 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match @@ -1,12 +1,19 @@ -urPhysicalMemCreateTest.Success/* +urPhysicalMemCreateWithSizeParamTest.Success/* +urPhysicalMemCreateWithFlagsParamTest.Success/* urPhysicalMemCreateTest.InvalidNullHandleContext/* urPhysicalMemCreateTest.InvalidNullHandleDevice/* urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/* -urPhysicalMemCreateTest.InvalidSize/* +urPhysicalMemCreateTest.InvalidEnumeration/* +urPhysicalMemCreateWithSizeParamTest.InvalidSize/* urPhysicalMemReleaseTest.Success/* urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/* urPhysicalMemRetainTest.Success/* urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/* +urPhysicalMemGetInfoTest.Context/* +urPhysicalMemGetInfoTest.Device/* +urPhysicalMemGetInfoTest.Size/* +urPhysicalMemGetInfoTest.Properties/* +urPhysicalMemGetInfoTest.ReferenceCount/* urVirtualMemFreeTest.Success/* urVirtualMemFreeTest.InvalidNullHandleContext/* urVirtualMemFreeTest.InvalidNullPointerStart/* @@ -17,7 +24,7 @@ urVirtualMemGetInfoTest.InvalidEnumerationInfo/* urVirtualMemGranularityGetInfoTest.Success/*__UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM urVirtualMemGranularityGetInfoTest.Success/*__UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/* -urVirtualMemMapTest.Success/* +urVirtualMemMapWithFlagsTest.Success/* urVirtualMemMapTest.InvalidNullHandleContext/* urVirtualMemMapTest.InvalidNullHandlePhysicalMem/* urVirtualMemMapTest.InvalidNullPointerStart/* @@ -26,9 +33,10 @@ urVirtualMemReserveTestWithParam.SuccessNoStartPointer/* urVirtualMemReserveTestWithParam.SuccessWithStartPointer/* urVirtualMemReserveTest.InvalidNullHandleContext/* urVirtualMemReserveTest.InvalidNullPointer/* -urVirtualMemSetAccessTest.Success/* +urVirtualMemSetAccessWithFlagsTest.Success/* urVirtualMemSetAccessTest.InvalidNullHandleContext/* urVirtualMemSetAccessTest.InvalidNullPointerStart/* +urVirtualMemSetAccessTest.InvalidEnumeration/* urVirtualMemUnmapTest.Success/* urVirtualMemUnmapTest.InvalidNullHandleContext/* urVirtualMemUnmapTest.InvalidNullPointerStart/* From 9af90a55fa8b7ff93eaedfc6fc939d1d345e2fd0 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 18 Nov 2024 11:17:18 +0000 Subject: [PATCH 111/148] Add try catches to anything that can throw UR_CHECK_ERROR must be called within a try catch block, as it can throw. For functions that don't return a `ur_result_handle_t`, and which might throw, make sure these funcs are only called within try catches in the caller funcs, and that they are not exposed to the user. 
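Illustrative sketch of the resulting pattern (not part of the patch;
urSomethingExp, ur_something_handle_t, and doWork are placeholder names):

    // An entry point becomes a function-try-block, so a ur_result_t thrown
    // by UR_CHECK_ERROR (or any throwing helper) is converted back into a
    // return code instead of escaping across the API boundary to the user.
    ur_result_t urSomethingExp(ur_something_handle_t Handle) try {
      UR_CHECK_ERROR(doWork(Handle)); // may throw ur_result_t
      return UR_RESULT_SUCCESS;
    } catch (ur_result_t Err) {
      return Err;
    }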
--- source/adapters/cuda/command_buffer.cpp | 865 ++++++++++++------------ source/adapters/hip/command_buffer.cpp | 235 ++++--- 2 files changed, 543 insertions(+), 557 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index b60d2944b1..dbaadbfbd0 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -21,7 +21,7 @@ namespace { ur_result_t -commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) { +commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) try { if (CommandBuffer->decrementInternalReferenceCount() != 0) { return UR_RESULT_SUCCESS; } @@ -36,10 +36,12 @@ commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) { delete CommandBuffer; return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } -ur_result_t -commandHandleReleaseInternal(ur_exp_command_buffer_command_handle_t Command) { +ur_result_t commandHandleReleaseInternal( + ur_exp_command_buffer_command_handle_t Command) try { if (Command->decrementInternalReferenceCount() != 0) { return UR_RESULT_SUCCESS; } @@ -60,6 +62,8 @@ commandHandleReleaseInternal(ur_exp_command_buffer_command_handle_t Command) { delete Command; return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } } // end anonymous namespace @@ -82,6 +86,7 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { UR_TRACE(urDeviceRelease(Device)); } +// This may throw so it must be called from within a try...catch std::unique_ptr ur_exp_command_buffer_handle_t_::addSignalNode(CUgraphNode DepNode, CUgraphNode &SignalNode) { @@ -96,7 +101,7 @@ ur_exp_command_buffer_handle_t_::addSignalNode(CUgraphNode DepNode, ur_result_t ur_exp_command_buffer_handle_t_::addWaitNodes( std::vector &DepsList, uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList) { + const ur_event_handle_t *EventWaitList) try { std::vector WaitNodes(NumEventsInWaitList); for (uint32_t i = 0; i < NumEventsInWaitList; i++) { CUevent Event = EventWaitList[i]->get(); @@ -107,6 +112,8 @@ ur_result_t ur_exp_command_buffer_handle_t_::addWaitNodes( // nodes created. DepsList = std::move(WaitNodes); return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } kernel_command_handle::kernel_command_handle( @@ -219,7 +226,7 @@ static ur_result_t enqueueCommandBufferFillHelper( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *RetEvent, - ur_exp_command_buffer_command_handle_t *RetCommand) { + ur_exp_command_buffer_command_handle_t *RetCommand) try { std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, DepsList)); @@ -229,128 +236,124 @@ static ur_result_t enqueueCommandBufferFillHelper( EventWaitList)); } - try { - // Graph node added to graph, if multiple nodes are created this will - // be set to the leaf node - CUgraphNode GraphNode; - - const size_t N = Size / PatternSize; - auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE - ? 
*static_cast(DstDevice) - : (CUdeviceptr)DstDevice; - - if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - CUDA_MEMSET_NODE_PARAMS NodeParams = {}; - NodeParams.dst = DstPtr; - NodeParams.elementSize = PatternSize; - NodeParams.height = N; - NodeParams.pitch = PatternSize; - NodeParams.width = 1; - - // pattern size in bytes - switch (PatternSize) { - case 1: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - case 2: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - case 4: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - } + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + CUgraphNode GraphNode; - UR_CHECK_ERROR( - cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, - DepsList.data(), DepsList.size(), &NodeParams, - CommandBuffer->Device->getNativeContext())); - } else { - // CUDA has no memset functions that allow setting values more than 4 - // bytes. UR API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 1 byte values, and set the buffer using multiple strided calls. - // This means that one cuGraphAddMemsetNode call is made for every 1 - // bytes in the pattern. + const size_t N = Size / PatternSize; + auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE + ? *static_cast(DstDevice) + : (CUdeviceptr)DstDevice; + + if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { + CUDA_MEMSET_NODE_PARAMS NodeParams = {}; + NodeParams.dst = DstPtr; + NodeParams.elementSize = PatternSize; + NodeParams.height = N; + NodeParams.pitch = PatternSize; + NodeParams.width = 1; + + // pattern size in bytes + switch (PatternSize) { + case 1: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 2: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 4: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + } - size_t NumberOfSteps = PatternSize / sizeof(uint8_t); + UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, CommandBuffer->Device->getNativeContext())); + } else { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. UR API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 1 byte values, and set the buffer using multiple strided calls. + // This means that one cuGraphAddMemsetNode call is made for every 1 + // bytes in the pattern. + + size_t NumberOfSteps = PatternSize / sizeof(uint8_t); + + // Update NodeParam + CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; + NodeParamsStepFirst.dst = DstPtr; + NodeParamsStepFirst.elementSize = sizeof(uint32_t); + NodeParamsStepFirst.height = Size / sizeof(uint32_t); + NodeParamsStepFirst.pitch = sizeof(uint32_t); + NodeParamsStepFirst.value = *static_cast(Pattern); + NodeParamsStepFirst.width = 1; + + UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParamsStepFirst, CommandBuffer->Device->getNativeContext())); + + DepsList.clear(); + DepsList.push_back(GraphNode); + + // we walk up the pattern in 1-byte steps, and call cuMemset for each + // 1-byte chunk of the pattern. 
+ for (auto Step = 4u; Step < NumberOfSteps; ++Step) { + // take 4 bytes of the pattern + auto Value = *(static_cast(Pattern) + Step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); // Update NodeParam - CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; - NodeParamsStepFirst.dst = DstPtr; - NodeParamsStepFirst.elementSize = sizeof(uint32_t); - NodeParamsStepFirst.height = Size / sizeof(uint32_t); - NodeParamsStepFirst.pitch = sizeof(uint32_t); - NodeParamsStepFirst.value = *static_cast(Pattern); - NodeParamsStepFirst.width = 1; + CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; + NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; + NodeParamsStep.elementSize = sizeof(uint8_t); + NodeParamsStep.height = Size / NumberOfSteps; + NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t); + NodeParamsStep.value = Value; + NodeParamsStep.width = 1; UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), - DepsList.size(), &NodeParamsStepFirst, + DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getNativeContext())); DepsList.clear(); DepsList.push_back(GraphNode); - - // we walk up the pattern in 1-byte steps, and call cuMemset for each - // 1-byte chunk of the pattern. - for (auto Step = 4u; Step < NumberOfSteps; ++Step) { - // take 4 bytes of the pattern - auto Value = *(static_cast(Pattern) + Step); - - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); - - // Update NodeParam - CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; - NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; - NodeParamsStep.elementSize = sizeof(uint8_t); - NodeParamsStep.height = Size / NumberOfSteps; - NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t); - NodeParamsStep.value = Value; - NodeParamsStep.width = 1; - - UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), - DepsList.size(), &NodeParamsStep, - CommandBuffer->Device->getNativeContext())); - - DepsList.clear(); - DepsList.push_back(GraphNode); - } } + } - CUgraphNode SignalNode = nullptr; - if (RetEvent) { - auto SignalEvent = CommandBuffer->addSignalNode(GraphNode, SignalNode); - *RetEvent = SignalEvent.release(); - } + CUgraphNode SignalNode = nullptr; + if (RetEvent) { + auto SignalEvent = CommandBuffer->addSignalNode(GraphNode, SignalNode); + *RetEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = CommandBuffer->addSyncPoint(SyncPointNode); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = CommandBuffer->addSyncPoint(SyncPointNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } - std::vector WaitNodes = - NumEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes); - CommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + NumEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes); + CommandBuffer->CommandHandles.push_back(NewCommand); - if (RetCommand) { - NewCommand->incrementInternalReferenceCount(); - *RetCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (RetCommand) { + NewCommand->incrementInternalReferenceCount(); + *RetCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -560,7 +563,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -571,43 +574,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( phEventWaitList)); } - try { - CUDA_MEMCPY3D NodeParams = {}; - setCopyParams(pSrc, CU_MEMORYTYPE_HOST, pDst, CU_MEMORYTYPE_HOST, size, - NodeParams); + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(pSrc, CU_MEMORYTYPE_HOST, pDst, CU_MEMORYTYPE_HOST, size, + NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new usm_memcpy_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new usm_memcpy_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -617,7 +618,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; @@ -634,48 +635,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( phEventWaitList)); } - try { - auto Src = std::get(hSrcMem->Mem) - .getPtrWithOffset(hCommandBuffer->Device, srcOffset); - auto Dst = std::get(hDstMem->Mem) - .getPtrWithOffset(hCommandBuffer->Device, dstOffset); + auto Src = std::get(hSrcMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, srcOffset); + auto Dst = std::get(hDstMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, dstOffset); - CUDA_MEMCPY3D NodeParams = {}; - setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size, - NodeParams); + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size, + NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_copy_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_copy_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -687,7 +686,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -698,49 +697,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( phEventWaitList)); } - try { - auto SrcPtr = - std::get(hSrcMem->Mem).getPtr(hCommandBuffer->Device); - auto DstPtr = - std::get(hDstMem->Mem).getPtr(hCommandBuffer->Device); - CUDA_MEMCPY3D NodeParams = {}; + auto SrcPtr = + std::get(hSrcMem->Mem).getPtr(hCommandBuffer->Device); + auto DstPtr = + std::get(hDstMem->Mem).getPtr(hCommandBuffer->Device); + CUDA_MEMCPY3D NodeParams = {}; - setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, - srcRowPitch, srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, - dstOrigin, dstRowPitch, dstSlicePitch, NodeParams); + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, + srcRowPitch, srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, + dstOrigin, dstRowPitch, dstSlicePitch, NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? 
std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_copy_rect_command_handle( - hCommandBuffer, GraphNode, SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_copy_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT @@ -751,7 +748,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -762,46 +759,44 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( phEventWaitList)); } - try { - auto Dst = std::get(hBuffer->Mem) - .getPtrWithOffset(hCommandBuffer->Device, offset); + auto Dst = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); - CUDA_MEMCPY3D NodeParams = {}; - setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, - NodeParams); + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, + NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_write_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_write_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT @@ -811,7 +806,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -822,46 +817,44 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( phEventWaitList)); } - try { - auto Src = std::get(hBuffer->Mem) - .getPtrWithOffset(hCommandBuffer->Device, offset); + auto Src = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); - CUDA_MEMCPY3D NodeParams = {}; - setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, - NodeParams); + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, + NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_read_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_read_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT @@ -874,7 +867,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -885,48 +878,45 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( phEventWaitList)); } - try { - auto DstPtr = - std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); - CUDA_MEMCPY3D NodeParams = {}; + auto DstPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); + CUDA_MEMCPY3D NodeParams = {}; - setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, - hostRowPitch, hostSlicePitch, &DstPtr, - CU_MEMORYTYPE_DEVICE, bufferOffset, bufferRowPitch, - bufferSlicePitch, NodeParams); + setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, hostRowPitch, + hostSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, + bufferRowPitch, bufferSlicePitch, NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_write_rect_command_handle( - hCommandBuffer, GraphNode, SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_write_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT @@ -939,7 +929,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { CUgraphNode GraphNode; std::vector DepsList; UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, @@ -950,48 +940,45 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( phEventWaitList)); } - try { - auto SrcPtr = - std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); - CUDA_MEMCPY3D NodeParams = {}; + auto SrcPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); + CUDA_MEMCPY3D NodeParams = {}; - setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, - bufferRowPitch, bufferSlicePitch, pDst, - CU_MEMORYTYPE_HOST, hostOffset, hostRowPitch, - hostSlicePitch, NodeParams); + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, + bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, + hostOffset, hostRowPitch, hostSlicePitch, NodeParams); - UR_CHECK_ERROR(cuGraphAddMemcpyNode( - &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getNativeContext())); + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getNativeContext())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new buffer_read_rect_command_handle( - hCommandBuffer, GraphNode, SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new buffer_read_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -1001,7 +988,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { // Prefetch cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. CUgraphNode GraphNode; @@ -1015,39 +1002,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( phEventWaitList)); } - try { - // Add an empty node to preserve dependencies. - UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, - DepsList.data(), DepsList.size())); + // Add an empty node to preserve dependencies. + UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new usm_prefetch_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new usm_prefetch_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -1057,7 +1042,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { + ur_exp_command_buffer_command_handle_t *phCommand) try { // Mem-Advise cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. CUgraphNode GraphNode; @@ -1071,40 +1056,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( phEventWaitList)); } - try { - // Add an empty node to preserve dependencies. - UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, - DepsList.data(), DepsList.size())); + // Add an empty node to preserve dependencies. + UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size())); - // Add signal node if external return event is used. - CUgraphNode SignalNode = nullptr; - if (phEvent) { - auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); - *phEvent = SignalEvent.release(); - } + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } - // Get sync point and register the cuNode with it. - CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; - auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); - if (pSyncPoint) { - *pSyncPoint = SyncPoint; - } + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - std::vector WaitNodes = - numEventsInWaitList ? std::move(DepsList) : std::vector(); - auto NewCommand = new usm_advise_command_handle(hCommandBuffer, GraphNode, - SignalNode, WaitNodes); - hCommandBuffer->CommandHandles.push_back(NewCommand); + std::vector WaitNodes = + numEventsInWaitList ? 
std::move(DepsList) : std::vector(); + auto NewCommand = new usm_advise_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); - if (phCommand) { - NewCommand->incrementInternalReferenceCount(); - *phCommand = NewCommand; - } - } catch (ur_result_t Err) { - return Err; + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -1158,38 +1141,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t *phEvent) try { + std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getDevice()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - try { - std::unique_ptr RetImplEvent{nullptr}; - ScopedContext Active(hQueue->getDevice()); - uint32_t StreamToken; - ur_stream_guard_ Guard; - CUstream CuStream = hQueue->getNextComputeStream( - numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); - if (phEvent) { - RetImplEvent = std::unique_ptr( - ur_event_handle_t_::makeNative(UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - hQueue, CuStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } + if (phEvent) { + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); + } - // Launch graph - UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + // Launch graph + UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - } catch (ur_result_t Err) { - return Err; + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); } - return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -1236,9 +1215,10 @@ validateCommandDesc(kernel_command_handle *Command, /** * Updates the arguments of a kernel command. - * @param[in] Command The command associated with the kernel node being updated. - * @param[in] UpdateCommandDesc The update command description that contains the - * new arguments. + * @param[in] Command The command associated with the kernel node being + * updated. + * @param[in] UpdateCommandDesc The update command description that contains + * the new arguments. 
* @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t @@ -1357,7 +1337,7 @@ updateCommand(kernel_command_handle *Command, UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { + *pUpdateKernelLaunch) try { ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; @@ -1372,8 +1352,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CHECK_ERROR( updateKernelArguments(KernelCommandHandle, pUpdateKernelLaunch)); - // If no work-size is provided make sure we pass nullptr to setKernelParams so - // it can guess the local work size. + // If no work-size is provided make sure we pass nullptr to setKernelParams + // so it can guess the local work size. const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize(); size_t *LocalWorkSize = ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr; @@ -1409,11 +1389,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; UR_CHECK_ERROR(cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( ur_exp_command_buffer_command_handle_t hCommand, - ur_event_handle_t *phEvent) { + ur_event_handle_t *phEvent) try { ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; // Update requires command-buffer to be finalized @@ -1426,8 +1408,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( return UR_RESULT_ERROR_INVALID_OPERATION; } - // Error to try to update the signal event, when a signal event wasn't set on - // creation + // Error to try to update the signal event, when a signal event wasn't set + // on creation CUgraphNode SignalNode = hCommand->SignalNode; if (phEvent != nullptr && SignalNode == nullptr) { return UR_RESULT_ERROR_INVALID_OPERATION; @@ -1444,11 +1426,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( ur_exp_command_buffer_command_handle_t hCommand, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList) { + uint32_t NumEventsInWaitList, + const ur_event_handle_t *phEventWaitList) try { ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; // Update requires command-buffer to be finalized @@ -1475,6 +1460,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( } return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 538c2ff85a..86131a13ce 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -146,117 +146,115 @@ static ur_result_t enqueueCommandBufferFillHelper( const hipMemoryType DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *RetSyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) try { std::vector DepsList; 
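    // A note on the control flow (illustrative, not from the original patch):
    // the `try` written directly after the parameter list above is a C++
    // function-try-block. It wraps the whole function body, so the single
    // `catch (ur_result_t Err)` at the end of the function replaces the
    // nested try/catch blocks removed throughout this patch. A minimal
    // sketch of the idiom, using a hypothetical helper:
    //
    //   ur_result_t doWork(int Arg) try {
    //     mayThrowUrResult(Arg);     // assumed callee that throws ur_result_t
    //     return UR_RESULT_SUCCESS;
    //   } catch (ur_result_t Err) {
    //     return Err;                // thrown error mapped back to a return code
    //   }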
UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, DepsList)); - try { - // Graph node added to graph, if multiple nodes are created this will - // be set to the leaf node - hipGraphNode_t GraphNode; + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + hipGraphNode_t GraphNode; - const size_t N = Size / PatternSize; - auto DstPtr = DstType == hipMemoryTypeDevice - ? *static_cast(DstDevice) - : DstDevice; - - if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - hipMemsetParams NodeParams = {}; - NodeParams.dst = DstPtr; - NodeParams.elementSize = PatternSize; - NodeParams.height = N; - NodeParams.pitch = PatternSize; - NodeParams.width = 1; - - // pattern size in bytes - switch (PatternSize) { - case 1: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - case 2: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - case 4: { - auto Value = *static_cast(Pattern); - NodeParams.value = Value; - break; - } - } + const size_t N = Size / PatternSize; + auto DstPtr = DstType == hipMemoryTypeDevice + ? *static_cast(DstDevice) + : DstDevice; + + if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { + hipMemsetParams NodeParams = {}; + NodeParams.dst = DstPtr; + NodeParams.elementSize = PatternSize; + NodeParams.height = N; + NodeParams.pitch = PatternSize; + NodeParams.width = 1; + + // pattern size in bytes + switch (PatternSize) { + case 1: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 2: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 4: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + } - UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, - DepsList.data(), DepsList.size(), - &NodeParams)); + UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + &NodeParams)); - } else { - // HIP has no memset functions that allow setting values more than 4 - // bytes. UR API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 1 byte values, and set the buffer using multiple strided calls. - // This means that one hipGraphAddMemsetNode call is made for every 1 - // bytes in the pattern. + } else { + // HIP has no memset functions that allow setting values more than 4 + // bytes. UR API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 1 byte values, and set the buffer using multiple strided calls. + // This means that one hipGraphAddMemsetNode call is made for every 1 + // bytes in the pattern. + + size_t NumberOfSteps = PatternSize / sizeof(uint8_t); + + // Update NodeParam + hipMemsetParams NodeParamsStepFirst = {}; + NodeParamsStepFirst.dst = DstPtr; + NodeParamsStepFirst.elementSize = 4; + NodeParamsStepFirst.height = Size / sizeof(uint32_t); + NodeParamsStepFirst.pitch = 4; + NodeParamsStepFirst.value = *(static_cast(Pattern)); + NodeParamsStepFirst.width = 1; + + UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + &NodeParamsStepFirst)); + + DepsList.clear(); + DepsList.push_back(GraphNode); + + // we walk up the pattern in 1-byte steps, and add Memset node for each + // 1-byte chunk of the pattern. 
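    // Worked example with assumed sizes (not from the original patch): for
    // PatternSize == 8 and Size == 32, the 4-byte memset above fills all
    // 32 bytes with Pattern[0..3] repeated at a 4-byte pitch. The loop
    // below then issues one strided 1-byte memset per remaining pattern
    // byte: for Step == 4 it writes Pattern[4] at offsets 4, 12, 20 and 28
    // (pitch == PatternSize, height == Size / PatternSize), and likewise
    // for Steps 5..7, completing every 8-byte copy of the pattern.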
+ for (auto Step = 4u; Step < NumberOfSteps; ++Step) { + // take 1 bytes of the pattern + auto Value = *(static_cast(Pattern) + Step); - size_t NumberOfSteps = PatternSize / sizeof(uint8_t); + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = reinterpret_cast( + reinterpret_cast(DstPtr) + (Step * sizeof(uint8_t))); // Update NodeParam - hipMemsetParams NodeParamsStepFirst = {}; - NodeParamsStepFirst.dst = DstPtr; - NodeParamsStepFirst.elementSize = 4; - NodeParamsStepFirst.height = Size / sizeof(uint32_t); - NodeParamsStepFirst.pitch = 4; - NodeParamsStepFirst.value = *(static_cast(Pattern)); - NodeParamsStepFirst.width = 1; + hipMemsetParams NodeParamsStep = {}; + NodeParamsStep.dst = reinterpret_cast(OffsetPtr); + NodeParamsStep.elementSize = sizeof(uint8_t); + NodeParamsStep.height = Size / NumberOfSteps; + NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t); + NodeParamsStep.value = Value; + NodeParamsStep.width = 1; UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), - &NodeParamsStepFirst)); + &NodeParamsStep)); DepsList.clear(); DepsList.push_back(GraphNode); - - // we walk up the pattern in 1-byte steps, and add Memset node for each - // 1-byte chunk of the pattern. - for (auto Step = 4u; Step < NumberOfSteps; ++Step) { - // take 1 bytes of the pattern - auto Value = *(static_cast(Pattern) + Step); - - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = reinterpret_cast( - reinterpret_cast(DstPtr) + (Step * sizeof(uint8_t))); - - // Update NodeParam - hipMemsetParams NodeParamsStep = {}; - NodeParamsStep.dst = reinterpret_cast(OffsetPtr); - NodeParamsStep.elementSize = sizeof(uint8_t); - NodeParamsStep.height = Size / NumberOfSteps; - NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t); - NodeParamsStep.value = Value; - NodeParamsStep.width = 1; - - UR_CHECK_ERROR(hipGraphAddMemsetNode( - &GraphNode, CommandBuffer->HIPGraph, DepsList.data(), - DepsList.size(), &NodeParamsStep)); - - DepsList.clear(); - DepsList.push_back(GraphNode); - } - } - - // Get sync point and register the node with it. - auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; } + } - } catch (ur_result_t Err) { - return Err; + // Get sync point and register the node with it. 
+ auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; } + return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -438,10 +436,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(&GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), pDst, pSrc, size, hipMemcpyDefault)); @@ -479,10 +476,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto Src = std::get(hSrcMem->Mem) .getPtrWithOffset(hCommandBuffer->Device, srcOffset); auto Dst = std::get(hDstMem->Mem) @@ -523,10 +520,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto SrcPtr = std::get(hSrcMem->Mem).getPtr(hCommandBuffer->Device); auto DstPtr = @@ -571,10 +568,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto Dst = std::get(hBuffer->Mem) .getPtrWithOffset(hCommandBuffer->Device, offset); @@ -611,10 +608,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto Src = std::get(hBuffer->Mem) .getPtrWithOffset(hCommandBuffer->Device, offset); @@ -654,10 +651,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto DstPtr = std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); hipMemcpy3DParms NodeParams = {}; @@ -702,10 +699,10 @@ 
ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + auto SrcPtr = std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); hipMemcpy3DParms NodeParams = {}; @@ -749,10 +746,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + // Create an empty node if the kernel workload size is zero UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size())); @@ -788,10 +785,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList)); - try { + UR_CHECK_ERROR(getNodesFromSyncPoints( + hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); + // Create an empty node if the kernel workload size is zero UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size())); @@ -1065,7 +1062,7 @@ updateCommand(ur_exp_command_buffer_command_handle_t Command, UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { + *pUpdateKernelLaunch) try { ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; @@ -1105,6 +1102,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; +} catch (ur_result_t Err) { + return Err; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( From efc956467c6420d3e4ed9eae29806023f8c4ff39 Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Thu, 12 Dec 2024 15:08:16 +0100 Subject: [PATCH 112/148] [benchmarks] add umf suite --- .github/workflows/benchmarks-reusable.yml | 21 +++ scripts/benchmarks/benches/base.py | 7 +- scripts/benchmarks/benches/options.py | 1 + scripts/benchmarks/benches/umf.py | 158 ++++++++++++++++++++++ scripts/benchmarks/main.py | 10 +- scripts/benchmarks/output_html.py | 3 + 6 files changed, 195 insertions(+), 5 deletions(-) create mode 100644 scripts/benchmarks/benches/umf.py diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index 79cb35748e..6c00fbb04d 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -156,6 +156,26 @@ jobs: - name: Install UR run: cmake --install ${{github.workspace}}/ur_build + - name: Checkout UMF + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: 
oneapi-src/unified-memory-framework + ref: main + path: umf-repo + fetch-depth: 1 + fetch-tags: false + + - name: Configure UMF + run: > + cmake -DCMAKE_BUILD_TYPE=Release + -S${{github.workspace}}/umf-repo + -B${{github.workspace}}/umf_build + -DUMF_BUILD_BENCHMARKS=ON + -DUMF_TESTS_FAIL_ON_SKIP=ON + + - name: Build UMF + run: cmake --build ${{github.workspace}}/umf_build -j $(nproc) + - name: Run benchmarks working-directory: ${{ github.workspace }}/ur-repo/ id: benchmarks @@ -164,6 +184,7 @@ jobs: ~/bench_workdir --sycl ${{ github.workspace }}/sycl_build --ur ${{ github.workspace }}/ur_install + --umf ${{ github.workspace }}/umf_build --adapter ${{ matrix.adapter.str_name }} ${{ inputs.upload_report && '--output-html' || '' }} ${{ inputs.bench_script_params }} diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 31f2054d9a..abe15ca93c 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -26,7 +26,7 @@ def get_adapter_full_path(): assert False, \ f"could not find adapter file {adapter_path} (and in similar lib paths)" - def run_bench(self, command, env_vars, ld_library=[]): + def run_bench(self, command, env_vars, ld_library=[], add_sycl=True): env_vars_with_forced_adapter = env_vars.copy() if options.ur is not None: env_vars_with_forced_adapter.update( @@ -35,7 +35,7 @@ def run_bench(self, command, env_vars, ld_library=[]): return run( command=command, env_vars=env_vars_with_forced_adapter, - add_sycl=True, + add_sycl=add_sycl, cwd=options.benchmark_cwd, ld_library=ld_library ).stdout.decode() @@ -71,6 +71,9 @@ def run(self, env_vars) -> list[Result]: def teardown(self): raise NotImplementedError() + def stddev_threshold(self): + return None + class Suite: def benchmarks(self) -> list[Benchmark]: raise NotImplementedError() diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index fa5d52ca8c..7ef7956c8e 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -12,6 +12,7 @@ class Options: sycl: str = None ur: str = None ur_adapter: str = None + umf: str = None rebuild: bool = True benchmark_cwd: str = "INVALID" timeout: float = 600 diff --git a/scripts/benchmarks/benches/umf.py b/scripts/benchmarks/benches/umf.py new file mode 100644 index 0000000000..251cf15a93 --- /dev/null +++ b/scripts/benchmarks/benches/umf.py @@ -0,0 +1,158 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import random +from utils.utils import git_clone +from .base import Benchmark, Suite +from .result import Result +from utils.utils import run, create_build_path +from .options import options +from .oneapi import get_oneapi +import os +import csv +import io + +def isUMFAvailable(): + return options.umf is not None + +class UMFSuite(Suite): + def __init__(self, directory): + self.directory = directory + if not isUMFAvailable(): + print("UMF not provided. 
Related benchmarks will not run") + + def setup(self): + if not isUMFAvailable(): + return [] + self.built = True + + def benchmarks(self) -> list[Benchmark]: + if not isUMFAvailable(): + return + + benches = [ + GBench(self), + ] + + return benches + +class ComputeUMFBenchmark(Benchmark): + def __init__(self, bench, name): + self.bench = bench + self.bench_name = name + self.oneapi = get_oneapi() + + self.col_name = None + self.col_iterations = None + self.col_real_time = None + self.col_cpu_time = None + self.col_time_unit = None + + self.col_statistics_time = None + + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def setup(self): + if not isUMFAvailable(): + print("UMF prefix path not provided") + return + + self.benchmark_bin = os.path.join(options.umf, 'benchmark', self.bench_name) + + def run(self, env_vars) -> list[Result]: + command = [ + f"{self.benchmark_bin}", + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars, add_sycl=False, ld_library=[self.oneapi.tbb_lib()]) + parsed = self.parse_output(result) + results = [] + for r in parsed: + (config, pool, mean) = r + label = f"{config} {pool}" + results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result, unit="ns", explicit_group=config)) + return results + + # Implementation with self.col_* indices could lead to the division by None + def get_mean(self, datarow): + raise NotImplementedError() + + def teardown(self): + return + +class GBench(ComputeUMFBenchmark): + def __init__(self, bench): + super().__init__(bench, "umf-benchmark") + + self.col_name = 0 + self.col_iterations = 1 + self.col_real_time = 2 + self.col_cpu_time = 3 + self.col_time_unit = 4 + + self.idx_pool = 0 + self.idx_config = 1 + self.name_separator = '/' + + self.col_statistics_time = self.col_real_time + + def name(self): + return self.bench_name + + # --benchmark_format describes stdout output + # --benchmark_out= and --benchmark_out_format= + # describe output to a file + def bin_args(self): + return ["--benchmark_format=csv"] + + # the default unit + # might be changed globally with --benchmark_time_unit={ns|us|ms|s} + # the change affects only benchmark where time unit has not been set + # explicitly + def unit(self): + return "ns" + + # these benchmarks are not stable, so set this at a large value + def stddev_threshold(self) -> float: + return 0.2 # 20% + + def get_pool_and_config(self, full_name): + list_split = full_name.split(self.name_separator, 1) + if len(list_split) != 2: + raise ValueError("Incorrect benchmark name format: ", full_name) + + return list_split[self.idx_pool], list_split[self.idx_config] + + def get_mean(self, datarow): + return float(datarow[self.col_statistics_time]) + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + + data_row = next(reader, None) + if data_row is None: + raise ValueError("Benchmark output does not contain data.") + + results = [] + for row in reader: + try: + full_name = row[self.col_name] + pool, config = self.get_pool_and_config(full_name) + mean = self.get_mean(row) + results.append((config, pool, mean)) + except KeyError as e: + raise ValueError(f"Error parsing output: {e}") + + return results diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index ab4adafee6..1b28ec702e 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -9,6 
+9,7 @@ from benches.velocity import VelocityBench from benches.syclbench import * from benches.llamacpp import * +from benches.umf import * from benches.test import TestSuite from benches.options import Compare, options from output_markdown import generate_markdown @@ -74,7 +75,7 @@ def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> return new_results -def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]: +def process_results(results: dict[str, list[Result]], stddev_threshold_override) -> tuple[bool, list[Result]]: processed: list[Result] = [] # technically, we can detect whether result is below or above threshold per # individual result. However, we can't repeat benchmark runs with that @@ -94,7 +95,7 @@ def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result mean_value = statistics.mean(values) stddev = statistics.stdev(values) - threshold = options.stddev_threshold * mean_value + threshold = (stddev_threshold_override if stddev_threshold_override is not None else options.stddev_threshold) * mean_value if stddev > threshold: print(f"stddev {stddev} above the threshold {threshold} for {label}") @@ -120,6 +121,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): VelocityBench(directory), SyclBench(directory), LlamaCppBench(directory), + UMFSuite(directory), #TestSuite() ] if not options.dry_run else [] @@ -159,7 +161,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): processed: list[Result] = [] for _ in range(5): run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results) - valid, processed = process_results(intermediate_results) + valid, processed = process_results(intermediate_results, benchmark.stddev_threshold()) if valid: break results += processed @@ -231,6 +233,7 @@ def validate_and_parse_env_args(env_args): parser.add_argument('benchmark_directory', type=str, help='Working directory to setup benchmarks.') parser.add_argument('--sycl', type=str, help='Root directory of the SYCL compiler.', default=None) parser.add_argument('--ur', type=str, help='UR install prefix path', default=None) + parser.add_argument('--umf', type=str, help='UMF install prefix path', default=None) parser.add_argument('--adapter', type=str, help='Options to build the Unified Runtime as part of the benchmark', default="level_zero") parser.add_argument("--no-rebuild", help='Rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) @@ -267,6 +270,7 @@ def validate_and_parse_env_args(env_args): options.output_html = args.output_html options.output_markdown = args.output_markdown options.dry_run = args.dry_run + options.umf = args.umf benchmark_filter = re.compile(args.filter) if args.filter else None diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 8bcda148b1..7a8c4af3fb 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -157,6 +157,9 @@ def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChar ax.bar_label(rects, fmt='') for rect, run, res in zip(rects, run_results.keys(), run_results.values()): + if res is None: + continue + height = rect.get_height() if height > max_height: max_height = height From e9c0e96600de798a7b081ff9d8f3123a6b99ef8b Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 12 Dec 2024 16:57:12 +0100 Subject: 
[PATCH 113/148] [benchmarks] add ability to filter benchmarks by suite --- scripts/benchmarks/benches/base.py | 9 ++- scripts/benchmarks/benches/compute.py | 5 +- scripts/benchmarks/benches/llamacpp.py | 5 +- scripts/benchmarks/benches/result.py | 3 +- scripts/benchmarks/benches/syclbench.py | 5 +- scripts/benchmarks/benches/umf.py | 5 +- scripts/benchmarks/benches/velocity.py | 5 +- scripts/benchmarks/main.py | 1 + scripts/benchmarks/output_html.py | 98 ++++++++++++++++++------- 9 files changed, 101 insertions(+), 35 deletions(-) diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index abe15ca93c..38bbedd25a 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -13,8 +13,9 @@ import tarfile class Benchmark: - def __init__(self, directory): + def __init__(self, directory, suite): self.directory = directory + self.suite = suite @staticmethod def get_adapter_full_path(): @@ -74,9 +75,15 @@ def teardown(self): def stddev_threshold(self): return None + def get_suite_name(self) -> str: + return self.suite.name() + class Suite: def benchmarks(self) -> list[Benchmark]: raise NotImplementedError() + def name(self) -> str: + raise NotImplementedError() + def setup(self): return diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 229a50e84d..be48acce36 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -15,6 +15,9 @@ class ComputeBench(Suite): def __init__(self, directory): self.directory = directory + def name(self) -> str: + return "Compute Benchmarks" + def setup(self): if options.sycl is None: return @@ -90,10 +93,10 @@ def parse_unit_type(compute_unit): class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): + super().__init__(bench.directory, bench) self.bench = bench self.bench_name = name self.test = test - super().__init__(bench.directory) def bin_args(self) -> list[str]: return [] diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py index 2dbdb5cbcf..8d01e2832d 100644 --- a/scripts/benchmarks/benches/llamacpp.py +++ b/scripts/benchmarks/benches/llamacpp.py @@ -21,6 +21,9 @@ def __init__(self, directory): self.directory = directory + def name(self) -> str: + return "llama.cpp bench" + def setup(self): if options.sycl is None: return @@ -64,8 +67,8 @@ def benchmarks(self) -> list[Benchmark]: class LlamaBench(Benchmark): def __init__(self, bench): + super().__init__(bench.directory, bench) self.bench = bench - super().__init__(bench.directory) def setup(self): self.benchmark_bin = os.path.join(self.bench.build_path, 'bin', 'llama-bench') diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index c975fa792d..aa1459cbb3 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -25,8 +25,9 @@ class Result: # values below should not be set by the benchmark name: str = "" lower_is_better: bool = True - git_hash: str = '' + git_hash: str = "" date: Optional[datetime] = None + suite: str = "" @dataclass_json @dataclass diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py index 588f3ce998..1e358eb071 100644 --- a/scripts/benchmarks/benches/syclbench.py +++ b/scripts/benchmarks/benches/syclbench.py @@ -19,6 +19,9 @@ def __init__(self, directory): self.directory = directory return + def name(self) -> str: + return "SYCL-Bench" + def setup(self): if options.sycl is None: 
return @@ -87,11 +90,11 @@ def benchmarks(self) -> list[Benchmark]: class SyclBenchmark(Benchmark): def __init__(self, bench, name, test): + super().__init__(bench.directory, bench) self.bench = bench self.bench_name = name self.test = test self.done = False - super().__init__(bench.directory) def bin_args(self) -> list[str]: return [] diff --git a/scripts/benchmarks/benches/umf.py b/scripts/benchmarks/benches/umf.py index 251cf15a93..ce2575293c 100644 --- a/scripts/benchmarks/benches/umf.py +++ b/scripts/benchmarks/benches/umf.py @@ -22,7 +22,10 @@ def __init__(self, directory): self.directory = directory if not isUMFAvailable(): print("UMF not provided. Related benchmarks will not run") - + + def name(self) -> str: + return "UMF" + def setup(self): if not isUMFAvailable(): return [] diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 705421d963..d22243ebeb 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -22,6 +22,9 @@ def __init__(self, directory): self.directory = directory + def name(self) -> str: + return "Velocity Bench" + def setup(self): if options.sycl is None: return @@ -46,7 +49,7 @@ def benchmarks(self) -> list[Benchmark]: class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str): - super().__init__(vb.directory) + super().__init__(vb.directory, vb) self.vb = vb self.bench_name = name self.bin_name = bin_name diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 1b28ec702e..9d0ee2d57d 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -42,6 +42,7 @@ def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str bench_result.name = bench_result.label bench_result.lower_is_better = benchmark.lower_is_better() + bench_result.suite = benchmark.get_suite_name() if bench_result.label not in results: results[bench_result.label] = [] diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 7a8c4af3fb..35072a72b4 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -15,6 +15,7 @@ @dataclass class BenchmarkMetadata: unit: str + suite: str lower_is_better: bool @dataclass @@ -26,6 +27,7 @@ class BenchmarkSeries: @dataclass class BenchmarkChart: label: str + suite: str html: str def tooltip_css() -> str: @@ -74,13 +76,6 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str targets=targets) mpld3.plugins.connect(fig, tooltip) - # This is so that the stddev doesn't fill the entire y axis on the chart - if all_values and all_stddevs: - max_value = max(all_values) - min_value = min(all_values) - max_stddev = max(all_stddevs) - ax.set_ylim(min_value - 3 * max_stddev, max_value + 3 * max_stddev) - ax.set_title(benchmark.label, pad=20) performance_indicator = "lower is better" if benchmark.metadata.lower_is_better else "higher is better" ax.text(0.5, 1.05, f"({performance_indicator})", @@ -98,7 +93,7 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) plt.tight_layout() - html_charts.append(BenchmarkChart(html=mpld3.fig_to_html(fig), label=benchmark.label)) + html_charts.append(BenchmarkChart(html=mpld3.fig_to_html(fig), label=benchmark.label, suite=benchmark.metadata.suite)) plt.close(fig) return html_charts @@ -119,7 +114,7 @@ def create_explicit_groups(benchmark_runs: 
list[BenchmarkRun], compare_names: li if res.explicit_group != '': if res.explicit_group not in groups: groups[res.explicit_group] = ExplicitGroup(name=res.explicit_group, nnames=len(compare_names), - metadata=BenchmarkMetadata(unit=res.unit, lower_is_better=res.lower_is_better), + metadata=BenchmarkMetadata(unit=res.unit, lower_is_better=res.lower_is_better, suite=res.suite), runs={}) group = groups[res.explicit_group] @@ -207,7 +202,7 @@ def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChar color='#666666') plt.tight_layout() - html_charts.append(BenchmarkChart(label=group.name, html=mpld3.fig_to_html(fig))) + html_charts.append(BenchmarkChart(label=group.name, html=mpld3.fig_to_html(fig), suite=group.metadata.suite)) plt.close(fig) return html_charts @@ -224,7 +219,8 @@ def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: li if result.label not in benchmark_metadata: benchmark_metadata[result.label] = BenchmarkMetadata( unit=result.unit, - lower_is_better=result.lower_is_better + lower_is_better=result.lower_is_better, + suite=result.suite ) result.date = run.date @@ -249,12 +245,15 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ benchmarks = process_benchmark_data(benchmark_runs, compare_names) timeseries = create_time_series_chart(benchmarks, github_repo) - timeseries_charts_html = '\n'.join(f'
<div class="chart" data-label="{ts.label}">{ts.html}</div>' for ts in timeseries)
+ timeseries_charts_html = '\n'.join(f'<div class="chart" data-label="{ts.label}" data-suite="{ts.suite}">{ts.html}</div>' for ts in timeseries) explicit_groups = create_explicit_groups(benchmark_runs, compare_names) bar_charts = create_grouped_bar_charts(explicit_groups)
- bar_charts_html = '\n'.join(f'<div class="chart" data-label="{bc.label}">{bc.html}</div>' for bc in bar_charts)
+ bar_charts_html = '\n'.join(f'<div class="chart" data-label="{bc.label}" data-suite="{bc.suite}">{bc.html}</div>
' for bc in bar_charts) + + suite_names = {t.suite for t in timeseries} + suite_checkboxes_html = ' '.join(f'' for suite in suite_names) html_template = f""" @@ -317,6 +316,16 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ width: 400px; max-width: 100%; }} + .suite-filter-container {{ + text-align: center; + margin-bottom: 24px; + padding: 16px; + background: #e9ecef; + border-radius: 8px; + }} + .suite-checkbox {{ + margin: 0 8px; + }} details {{ margin-bottom: 24px; }} @@ -342,46 +351,76 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ function filterCharts() {{ const regexInput = document.getElementById('bench-filter').value; const regex = new RegExp(regexInput, 'i'); + const activeSuites = Array.from(document.querySelectorAll('.suite-checkbox:checked')).map(checkbox => checkbox.getAttribute('data-suite')); const charts = document.querySelectorAll('.chart'); - let timeseriesVisible = false; - let barChartsVisible = false; charts.forEach(chart => {{ const label = chart.getAttribute('data-label'); - if (regex.test(label)) {{ + const suite = chart.getAttribute('data-suite'); + if (regex.test(label) && activeSuites.includes(suite)) {{ chart.style.display = ''; - if (chart.closest('.timeseries')) {{ - timeseriesVisible = true; - }} else if (chart.closest('.bar-charts')) {{ - barChartsVisible = true; - }} }} else {{ chart.style.display = 'none'; }} }}); - updateURL(regexInput); - - document.querySelector('.timeseries').open = timeseriesVisible; - document.querySelector('.bar-charts').open = barChartsVisible; + updateURL(); }} - function updateURL(regex) {{ + function updateURL() {{ const url = new URL(window.location); + const regex = document.getElementById('bench-filter').value; + const activeSuites = Array.from(document.querySelectorAll('.suite-checkbox:checked')).map(checkbox => checkbox.getAttribute('data-suite')); + if (regex) {{ url.searchParams.set('regex', regex); }} else {{ url.searchParams.delete('regex'); }} + + if (activeSuites.length > 0) {{ + url.searchParams.set('suites', activeSuites.join(',')); + }} else {{ + url.searchParams.delete('suites'); + }} + history.replaceState(null, '', url); }} document.addEventListener('DOMContentLoaded', (event) => {{ const regexParam = getQueryParam('regex'); + const suitesParam = getQueryParam('suites'); + if (regexParam) {{ document.getElementById('bench-filter').value = regexParam; - filterCharts(); }} + + const suiteCheckboxes = document.querySelectorAll('.suite-checkbox'); + if (suitesParam) {{ + const suites = suitesParam.split(','); + suiteCheckboxes.forEach(checkbox => {{ + if (suites.includes(checkbox.getAttribute('data-suite'))) {{ + checkbox.checked = true; + }} else {{ + checkbox.checked = false; + }} + }}); + }} else {{ + suiteCheckboxes.forEach(checkbox => {{ + checkbox.checked = true; + }}); + }} + filterCharts(); + + suiteCheckboxes.forEach(checkbox => {{ + checkbox.addEventListener('change', () => {{ + filterCharts(); + }}); + }}); + + document.getElementById('bench-filter').addEventListener('input', () => {{ + filterCharts(); + }}); }}); @@ -389,7 +428,10 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_

<h1>Benchmark Results</h1>
<input type="text" id="bench-filter" ...>
- ...
+ ...
+ <div class="suite-filter-container">
+     {suite_checkboxes_html}
+ </div>
<details class="timeseries" open>
<summary>
Historical Results From 69b0e1b58a2d5e64d02153d1f97908f38fc11903 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 14 Nov 2024 16:04:33 +0000 Subject: [PATCH 114/148] Make double finalizing fail A command buffer should fail if finalize is called multiple times on the same command buffer. This matches openCL behaviour. --- include/ur_api.h | 1 + scripts/core/exp-command-buffer.yml | 2 ++ source/adapters/cuda/command_buffer.cpp | 2 ++ source/adapters/cuda/command_buffer.hpp | 2 +- source/adapters/hip/command_buffer.cpp | 2 ++ source/adapters/hip/command_buffer.hpp | 2 +- source/adapters/level_zero/command_buffer.cpp | 1 + source/adapters/opencl/command_buffer.cpp | 1 + source/loader/ur_libapi.cpp | 1 + source/ur_api.cpp | 1 + test/conformance/exp_command_buffer/commands.cpp | 11 +++++++++++ .../exp_command_buffer_adapter_level_zero_v2.match | 1 + .../exp_command_buffer_adapter_native_cpu.match | 1 + 13 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 7922b53d6c..160cd7e8fa 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -8517,6 +8517,7 @@ urCommandBufferReleaseExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "If `hCommandBuffer` has already been finalized" /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 9708059b0a..7ca4c957d5 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -330,6 +330,8 @@ params: desc: "[in] Handle of the command-buffer object." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP + - $X_RESULT_ERROR_INVALID_OPERATION + - "If `hCommandBuffer` has already been finalized" - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index b60d2944b1..a799f7cf8a 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -401,6 +401,8 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + UR_ASSERT(hCommandBuffer->CudaGraphExec == nullptr, + UR_RESULT_ERROR_INVALID_OPERATION); try { const unsigned long long flags = 0; #if CUDA_VERSION >= 12000 diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index d2403a4ab3..67d725c3ad 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -355,7 +355,7 @@ struct ur_exp_command_buffer_handle_t_ { // Cuda Graph handle CUgraph CudaGraph; // Cuda Graph Exec handle - CUgraphExec CudaGraphExec; + CUgraphExec CudaGraphExec = nullptr; // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. 
std::atomic_uint32_t RefCountInternal; diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 538c2ff85a..75c338914b 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -306,6 +306,8 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + UR_ASSERT(hCommandBuffer->HIPGraphExec == nullptr, + UR_RESULT_ERROR_INVALID_OPERATION); try { const unsigned long long flags = 0; UR_CHECK_ERROR(hipGraphInstantiateWithFlags( diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index e162b8e640..a236a32c24 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -175,7 +175,7 @@ struct ur_exp_command_buffer_handle_t_ { // HIP Graph handle hipGraph_t HIPGraph; // HIP Graph Exec handle - hipGraphExec_t HIPGraphExec; + hipGraphExec_t HIPGraphExec = nullptr; // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCountInternal; diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 5ae19092a6..01c682c770 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -865,6 +865,7 @@ finalizeWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer) { ur_result_t urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { UR_ASSERT(CommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(!CommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); // It is not allowed to append to command list from multiple threads. 
std::scoped_lock Guard(CommandBuffer->Mutex); diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index a161a5b32b..ac23765f0f 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -124,6 +124,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + UR_ASSERT(!hCommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); cl_ext::clFinalizeCommandBufferKHR_fn clFinalizeCommandBufferKHR = nullptr; UR_RETURN_ON_FAILURE( diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 15c78956d8..b209dc1ea6 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -7547,6 +7547,7 @@ ur_result_t UR_APICALL urCommandBufferReleaseExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "If `hCommandBuffer` has already been finalized" /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferFinalizeExp( diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 3925d5d160..b9c67b42c6 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -6410,6 +6410,7 @@ ur_result_t UR_APICALL urCommandBufferReleaseExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "If `hCommandBuffer` has already been finalized" /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferFinalizeExp( diff --git a/test/conformance/exp_command_buffer/commands.cpp b/test/conformance/exp_command_buffer/commands.cpp index 49b2444176..5c4d4e7e4d 100644 --- a/test/conformance/exp_command_buffer/commands.cpp +++ b/test/conformance/exp_command_buffer/commands.cpp @@ -204,3 +204,14 @@ TEST_P(urCommandBufferAppendKernelLaunchExpTest, Basic) { ASSERT_EQ(result, ptrZ[i]); } } + +TEST_P(urCommandBufferAppendKernelLaunchExpTest, FinalizeTwice) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + EXPECT_EQ_RESULT(urCommandBufferFinalizeExp(cmd_buf_handle), + UR_RESULT_ERROR_INVALID_OPERATION); +} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match index 5aa63f1cbc..5cd3d2a0ff 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match @@ -14,6 +14,7 @@ urCommandBufferCommandsTest.urCommandBufferAppendMemBufferFillExp/* urCommandBufferCommandsTest.urCommandBufferAppendUSMPrefetchExp/* urCommandBufferCommandsTest.urCommandBufferAppendUSMAdviseExp/* urCommandBufferAppendKernelLaunchExpTest.Basic/* +urCommandBufferAppendKernelLaunchExpTest.FinalizeTwice/* urCommandBufferFillCommandsTest.Buffer/* urCommandBufferFillCommandsTest.USM/* KernelCommandEventSyncTest.Basic/* diff --git 
a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index 3588eaea82..d6dc9a975c 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -5,6 +5,7 @@ {{OPT}}urCommandBufferRetainCommandExpTest.Success/* {{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/* {{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/* +{{OPT}}urCommandBufferAppendKernelLaunchExpTest.FinalizeTwice/* {{OPT}}BufferFillCommandTest.UpdateParameters/* {{OPT}}BufferFillCommandTest.UpdateGlobalSize/* {{OPT}}BufferFillCommandTest.SeparateUpdateCalls/* From 2e38d5cf77f1e3154fffe60912eae427e86b2e1a Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 12 Dec 2024 18:43:07 +0000 Subject: [PATCH 115/148] [Benchmarks] fix running benchmarks when umf dir is not specified benchmarks += s.benchmarks() was failing since benchmarks returned None --- scripts/benchmarks/benches/umf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarks/benches/umf.py b/scripts/benchmarks/benches/umf.py index 251cf15a93..a7978f555d 100644 --- a/scripts/benchmarks/benches/umf.py +++ b/scripts/benchmarks/benches/umf.py @@ -30,7 +30,7 @@ def setup(self): def benchmarks(self) -> list[Benchmark]: if not isUMFAvailable(): - return + return [] benches = [ GBench(self), From 3756b45fde2732510146a5ab955baaef91d20d1b Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Thu, 12 Dec 2024 23:34:38 +0000 Subject: [PATCH 116/148] [L0] Double free during urEventReleaseInternal Signed-off-by: Zhang, Winston --- source/adapters/level_zero/event.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index a0b3dcd328..eae16f0c57 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -1114,7 +1114,6 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { // enabled or not, so we access properties of the queue and that's why queue // must released later. 
if (DisableEventsCaching || !Event->OwnNativeHandle) { - ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); delete Event; } else { Event->Context->addEventToContextCache(Event); From 75a82aa9250d6609dacb18c0b887f19f3f76b5bc Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Fri, 13 Dec 2024 08:09:54 +0100 Subject: [PATCH 117/148] fix build --- source/loader/layers/sanitizer/asan/asan_interceptor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index ad3beda007..fb17b0a7f5 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -836,8 +836,9 @@ ur_result_t AsanInterceptor::prepareLaunch( getContext()->logger.info( "LaunchInfo {} (device={}, debug={}, numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data.getDevicePtr(), LaunchInfo.Data.Host.DeviceTy, - LaunchInfo.Data.Host.Debug, LaunchInfo.Data.Host.NumLocalArgs, + (void *)LaunchInfo.Data.getDevicePtr(), + ToString(LaunchInfo.Data.Host.DeviceTy), LaunchInfo.Data.Host.Debug, + LaunchInfo.Data.Host.NumLocalArgs, (void *)LaunchInfo.Data.Host.LocalArgs); return UR_RESULT_SUCCESS; From f10473a9a268964e07153ca13423a291506e0579 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Fri, 13 Dec 2024 08:14:05 +0100 Subject: [PATCH 118/148] register globals --- .../loader/layers/sanitizer/asan/asan_interceptor.cpp | 10 ++++++++++ .../loader/layers/sanitizer/asan/asan_interceptor.hpp | 1 + 2 files changed, 11 insertions(+) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 19af8546c2..02dcb3d0d3 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -431,6 +431,12 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { auto ProgramInfo = getProgramInfo(Program); assert(ProgramInfo != nullptr && "unregistered program!"); + std::scoped_lock Guard(m_AllocationMapMutex); + for (auto AI : ProgramInfo->AllocInfoForGlobals) { + m_AllocationMap.erase(AI->AllocBegin); + } + ProgramInfo->AllocInfoForGlobals.clear(); + ProgramInfo->InstrumentedKernels.clear(); return UR_RESULT_SUCCESS; @@ -549,6 +555,10 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { {}}); ContextInfo->insertAllocInfo({Device}, AI); + ProgramInfo->AllocInfoForGlobals.emplace(AI); + + std::scoped_lock Guard(m_AllocationMapMutex); + m_AllocationMap.emplace(AI->AllocBegin, std::move(AI)); } } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index f1e80dae56..2270795969 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -112,6 +112,7 @@ struct ProgramInfo { std::atomic RefCount = 1; // Program is built only once, so we don't need to lock it + std::unordered_set> AllocInfoForGlobals; std::unordered_set InstrumentedKernels; explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) { From 9861d4109950c12caa1e67466d536813dbc92d20 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 21 Oct 2024 11:15:09 +0100 Subject: [PATCH 119/148] Bump and synchronise versions of DPCPP used by CI change updates those URLs so all testing is done with the same DPCPP version. 
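Before this change, build-fuzz-reusable.yml, build-hw-reusable.yml and multi_device.yml pinned nightly-2024-01-29, while install_dpcpp.sh and cmake.yml used nightly-2024-09-27; all five locations now fetch the same nightly-2024-10-23 tarball.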
--- .github/docker/install_dpcpp.sh | 2 +- .github/workflows/build-fuzz-reusable.yml | 2 +- .github/workflows/build-hw-reusable.yml | 2 +- .github/workflows/cmake.yml | 2 +- .github/workflows/multi_device.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/docker/install_dpcpp.sh b/.github/docker/install_dpcpp.sh index 87548a5b64..32df1a5d9d 100755 --- a/.github/docker/install_dpcpp.sh +++ b/.github/docker/install_dpcpp.sh @@ -16,5 +16,5 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then fi mkdir -p ${DPCPP_PATH}/dpcpp_compiler -wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz +wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler diff --git a/.github/workflows/build-fuzz-reusable.yml b/.github/workflows/build-fuzz-reusable.yml index 2cbd1b87ff..b7ba83f3b9 100644 --- a/.github/workflows/build-fuzz-reusable.yml +++ b/.github/workflows/build-fuzz-reusable.yml @@ -35,7 +35,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml index 3e332c73fc..6ceda23a0e 100644 --- a/.github/workflows/build-hw-reusable.yml +++ b/.github/workflows/build-hw-reusable.yml @@ -77,7 +77,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0a4ae99a58..1a2e588bb0 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -76,7 +76,7 @@ jobs: if: matrix.os == 'ubuntu-22.04' run: | sudo apt install libncurses5 - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz mkdir -p ${{github.workspace}}/dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler diff --git a/.github/workflows/multi_device.yml b/.github/workflows/multi_device.yml index 2abc32cb1e..83f64029ff 100644 --- a/.github/workflows/multi_device.yml +++ b/.github/workflows/multi_device.yml @@ -33,7 +33,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler From 19237047ac004f0ccd456d43679261dadf6fa454 Mon Sep 
17 00:00:00 2001 From: Isaac Ault Date: Fri, 13 Dec 2024 10:15:27 +0000 Subject: [PATCH 120/148] Include DPC++ libs in compilation of device code. --- test/conformance/device_code/CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 1621b01544..a59cbae7b0 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -109,9 +109,11 @@ macro(add_device_binary SOURCE_FILE) endif() add_custom_command(OUTPUT ${BIN_PATH} - COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off - ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} - ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH} + COMMAND LD_LIBRARY_PATH=${UR_SYCL_LIBRARY_DIR}:$ENV{LD_LIBRARY_PATH} + ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} + -fsycl-device-code-split=off ${AMD_TARGET_BACKEND} + ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} ${DPCXX_BUILD_FLAGS_LIST} + ${SOURCE_FILE} -o ${EXE_PATH} COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} ${UR_DEVICE_CODE_EXTRACTOR} --stem="${TRIPLE}.bin" ${EXE_PATH} From 52a14bdac099877c176c9877546c99744601ba48 Mon Sep 17 00:00:00 2001 From: Isaac Ault Date: Fri, 13 Dec 2024 11:15:53 +0000 Subject: [PATCH 121/148] Update to 12/12/2024 --- .github/docker/install_dpcpp.sh | 2 +- .github/workflows/build-fuzz-reusable.yml | 2 +- .github/workflows/build-hw-reusable.yml | 2 +- .github/workflows/cmake.yml | 2 +- .github/workflows/multi_device.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/docker/install_dpcpp.sh b/.github/docker/install_dpcpp.sh index 32df1a5d9d..defb597fb1 100755 --- a/.github/docker/install_dpcpp.sh +++ b/.github/docker/install_dpcpp.sh @@ -16,5 +16,5 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then fi mkdir -p ${DPCPP_PATH}/dpcpp_compiler -wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz +wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler diff --git a/.github/workflows/build-fuzz-reusable.yml b/.github/workflows/build-fuzz-reusable.yml index b7ba83f3b9..e46df87c94 100644 --- a/.github/workflows/build-fuzz-reusable.yml +++ b/.github/workflows/build-fuzz-reusable.yml @@ -35,7 +35,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml index 6ceda23a0e..3c791db161 100644 --- a/.github/workflows/build-hw-reusable.yml +++ b/.github/workflows/build-hw-reusable.yml @@ -77,7 +77,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler diff --git 
a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 1a2e588bb0..2bd46c4e98 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -76,7 +76,7 @@ jobs: if: matrix.os == 'ubuntu-22.04' run: | sudo apt install libncurses5 - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz mkdir -p ${{github.workspace}}/dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler diff --git a/.github/workflows/multi_device.yml b/.github/workflows/multi_device.yml index 83f64029ff..5334e86b87 100644 --- a/.github/workflows/multi_device.yml +++ b/.github/workflows/multi_device.yml @@ -33,7 +33,7 @@ jobs: - name: Download DPC++ run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-10-23/sycl_linux.tar.gz + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-12-12/sycl_linux.tar.gz mkdir dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler From fcddf077c290e33118930eca30a5ab8494fb1293 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 12 Dec 2024 10:10:06 +0000 Subject: [PATCH 122/148] Fix L0 command-buffer consumption of multi-device kernels UR program and kernel objects can be tied to multiple devices, a UR command-buffer object however is tied to a single device. When appending a kernel command to a command-buffer, select the correct single-device `ze_kernel_handle_t` object from the multi-device `ur_kernel_handle_t` object --- source/adapters/level_zero/command_buffer.cpp | 73 +++++---- .../program_adapter_level_zero_v2.match | 1 + .../urMultiDeviceProgramCreateWithBinary.cpp | 138 ++++++++++++++++++ 3 files changed, 187 insertions(+), 25 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 5ae19092a6..32eff7e141 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -894,28 +894,31 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { /** * Sets the kernel arguments for a kernel command that will be appended to the * command buffer. - * @param[in] CommandBuffer The CommandBuffer where the command will be + * @param[in] Device The Device associated with the command-buffer where the + * kernel command will be appended. + * @param[in,out] Arguments stored in the ur_kernel_handle_t object to be set + * on the /p ZeKernel object. + * @param[in] ZeKernel The handle to the Level-Zero kernel that will be * appended. - * @param[in] Kernel The handle to the kernel that will be appended. * @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t -setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer, - ur_kernel_handle_t Kernel) { - +ur_result_t setKernelPendingArguments( + ur_device_handle_t Device, + std::vector &PendingArguments, + ze_kernel_handle_t ZeKernel) { // If there are any pending arguments set them now. - for (auto &Arg : Kernel->PendingArguments) { + for (auto &Arg : PendingArguments) { // The ArgValue may be a NULL pointer in which case a NULL value is used for // the kernel argument declared as a pointer to global or constant memory. 
char **ZeHandlePtr = nullptr; if (Arg.Value) { - UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - CommandBuffer->Device, nullptr, 0u)); + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, Device, + nullptr, 0u)); } ZE2UR_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); } - Kernel->PendingArguments.clear(); + PendingArguments.clear(); return UR_RESULT_SUCCESS; } @@ -951,6 +954,8 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; auto Platform = CommandBuffer->Context->getPlatform(); + auto ZeDevice = CommandBuffer->Device->ZeDevice; + if (NumKernelAlternatives > 0) { ZeMutableCommandDesc.flags |= ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION; @@ -958,14 +963,20 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, std::vector TranslatedKernelHandles( NumKernelAlternatives + 1, nullptr); + ze_kernel_handle_t ZeMainKernel{}; + UR_CALL(getZeKernel(ZeDevice, Kernel, &ZeMainKernel)); + // Translate main kernel first ZE2UR_CALL(zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, Kernel->ZeKernel, + (ZEL_HANDLE_KERNEL, ZeMainKernel, (void **)&TranslatedKernelHandles[0])); for (size_t i = 0; i < NumKernelAlternatives; i++) { + ze_kernel_handle_t ZeAltKernel{}; + UR_CALL(getZeKernel(ZeDevice, KernelAlternatives[i], &ZeAltKernel)); + ZE2UR_CALL(zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, KernelAlternatives[i]->ZeKernel, + (ZEL_HANDLE_KERNEL, ZeAltKernel, (void **)&TranslatedKernelHandles[i + 1])); } @@ -1022,23 +1033,28 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( std::scoped_lock Lock( Kernel->Mutex, Kernel->Program->Mutex, CommandBuffer->Mutex); + auto Device = CommandBuffer->Device; + ze_kernel_handle_t ZeKernel{}; + UR_CALL(getZeKernel(Device->ZeDevice, Kernel, &ZeKernel)); + if (GlobalWorkOffset != NULL) { - UR_CALL(setKernelGlobalOffset(CommandBuffer->Context, Kernel->ZeKernel, - WorkDim, GlobalWorkOffset)); + UR_CALL(setKernelGlobalOffset(CommandBuffer->Context, ZeKernel, WorkDim, + GlobalWorkOffset)); } // If there are any pending arguments set them now. 
if (!Kernel->PendingArguments.empty()) { - UR_CALL(setKernelPendingArguments(CommandBuffer, Kernel)); + UR_CALL( + setKernelPendingArguments(Device, Kernel->PendingArguments, ZeKernel)); } ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, CommandBuffer->Device, + UR_CALL(calculateKernelWorkDimensions(ZeKernel, Device, ZeThreadGroupDimensions, WG, WorkDim, GlobalWorkSize, LocalWorkSize)); - ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); CommandBuffer->KernelsList.push_back(Kernel); for (size_t i = 0; i < NumKernelAlternatives; i++) { @@ -1063,7 +1079,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( SyncPointWaitList, false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, + (CommandBuffer->ZeComputeCommandList, ZeKernel, &ZeThreadGroupDimensions, ZeLaunchEvent, ZeEventList.size(), getPointerFromVector(ZeEventList))); @@ -1836,6 +1852,7 @@ ur_result_t updateKernelCommand( const auto CommandBuffer = Command->CommandBuffer; const void *NextDesc = nullptr; auto Platform = CommandBuffer->Context->getPlatform(); + auto ZeDevice = CommandBuffer->Device->ZeDevice; uint32_t Dim = CommandDesc->newWorkDim; size_t *NewGlobalWorkOffset = CommandDesc->pNewGlobalWorkOffset; @@ -1844,11 +1861,14 @@ ur_result_t updateKernelCommand( // Kernel handle must be updated first for a given CommandId if required ur_kernel_handle_t NewKernel = CommandDesc->hNewKernel; + if (NewKernel && Command->Kernel != NewKernel) { + ze_kernel_handle_t ZeNewKernel{}; + UR_CALL(getZeKernel(ZeDevice, NewKernel, &ZeNewKernel)); + ze_kernel_handle_t ZeKernelTranslated = nullptr; - ZE2UR_CALL( - zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, NewKernel->ZeKernel, (void **)&ZeKernelTranslated)); + ZE2UR_CALL(zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, ZeNewKernel, (void **)&ZeKernelTranslated)); ZE2UR_CALL(Platform->ZeMutableCmdListExt .zexCommandListUpdateMutableCommandKernelsExp, @@ -1905,10 +1925,13 @@ ur_result_t updateKernelCommand( // by the driver for the kernel. 
bool UpdateWGSize = NewLocalWorkSize == nullptr; + ze_kernel_handle_t ZeKernel{}; + UR_CALL(getZeKernel(ZeDevice, Command->Kernel, &ZeKernel)); + uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions( - Command->Kernel->ZeKernel, CommandBuffer->Device, - ZeThreadGroupDimensions, WG, Dim, NewGlobalWorkSize, NewLocalWorkSize)); + UR_CALL(calculateKernelWorkDimensions(ZeKernel, CommandBuffer->Device, + ZeThreadGroupDimensions, WG, Dim, + NewGlobalWorkSize, NewLocalWorkSize)); auto MutableGroupCountDesc = std::make_unique>(); diff --git a/test/conformance/program/program_adapter_level_zero_v2.match b/test/conformance/program/program_adapter_level_zero_v2.match index 97d6869b81..fd359b3653 100644 --- a/test/conformance/program/program_adapter_level_zero_v2.match +++ b/test/conformance/program/program_adapter_level_zero_v2.match @@ -1,3 +1,4 @@ urProgramSetSpecializationConstantsTest.InvalidValueSize/* urProgramSetSpecializationConstantsTest.InvalidValueId/* urProgramSetSpecializationConstantsTest.InvalidValuePtr/* +{{OPT}}urMultiDeviceCommandBufferExpTest.* diff --git a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp index 9ff11d9016..5f99747462 100644 --- a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp +++ b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp @@ -240,3 +240,141 @@ TEST_F(urMultiDeviceProgramCreateWithBinaryTest, CheckProgramGetInfo) { reinterpret_cast(property_value.data()); ASSERT_STRNE(returned_kernel_names, ""); } + +struct urMultiDeviceCommandBufferExpTest + : urMultiDeviceProgramCreateWithBinaryTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + urMultiDeviceProgramCreateWithBinaryTest::SetUp()); + + auto kernelName = + uur::KernelsEnvironment::instance->GetEntryPointNames("foo")[0]; + + ASSERT_SUCCESS(urProgramBuild(context, binary_program, nullptr)); + ASSERT_SUCCESS( + urKernelCreate(binary_program, kernelName.data(), &kernel)); + } + + void TearDown() override { + if (kernel) { + EXPECT_SUCCESS(urKernelRelease(kernel)); + } + UUR_RETURN_ON_FATAL_FAILURE( + urMultiDeviceProgramCreateWithBinaryTest::TearDown()); + } + + static bool hasCommandBufferSupport(ur_device_handle_t device) { + ur_bool_t cmd_buffer_support = false; + auto res = urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP, + sizeof(cmd_buffer_support), &cmd_buffer_support, nullptr); + + if (res) { + return false; + } + + return cmd_buffer_support; + } + + static bool hasCommandBufferUpdateSupport(ur_device_handle_t device) { + ur_device_command_buffer_update_capability_flags_t + update_capability_flags; + auto res = urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP, + sizeof(update_capability_flags), &update_capability_flags, nullptr); + + if (res) { + return false; + } + + return (0 != update_capability_flags); + } + + ur_kernel_handle_t kernel = nullptr; + + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr size_t global_size = 64; + static constexpr size_t local_size = 4; +}; + +TEST_F(urMultiDeviceCommandBufferExpTest, Enqueue) { + for (size_t i = 0; i < devices.size(); i++) { + auto device = devices[i]; + if (!hasCommandBufferSupport(device)) { + continue; + } + + // Create command-buffer + uur::raii::CommandBuffer cmd_buf_handle; + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, + cmd_buf_handle.ptr())); + + // Append kernel command to 
command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + // Verify execution succeeds + ASSERT_SUCCESS(urCommandBufferEnqueueExp(cmd_buf_handle, queues[i], 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queues[i])); + } +} + +TEST_F(urMultiDeviceCommandBufferExpTest, Update) { + for (size_t i = 0; i < devices.size(); i++) { + auto device = devices[i]; + if (!(hasCommandBufferSupport(device) && + hasCommandBufferUpdateSupport(device))) { + continue; + } + + // Create a command-buffer with update enabled. + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true, false, + false}; + + // Create command-buffer + uur::raii::CommandBuffer cmd_buf_handle; + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, + cmd_buf_handle.ptr())); + + // Append kernel command to command-buffer and close command-buffer + uur::raii::CommandBufferCommand command; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, + command.ptr())); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + // Verify execution succeeds + ASSERT_SUCCESS(urCommandBufferEnqueueExp(cmd_buf_handle, queues[i], 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queues[i])); + + // Update kernel and enqueue command-buffer again + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(cmd_buf_handle, queues[i], 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queues[i])); + } +} From f9efc51198e5baaad7e84e17f2ec066234ca66a6 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 10 Dec 2024 14:56:15 +0100 Subject: [PATCH 123/148] [benchmarks] pin benchmarks to specific cores Currently we use numactl to pin the benchmark scripts to an entire numa node. However, after some testing, this proved to be insufficient to make all the benchmarks stable. This patch will change the workflow to now use taskset to pin the benchmark script to physical cores of the first numa node, with the exception for the first four, which are more likely to be used by the kernel for bookkeeping work or e.g., handling interrupts. 
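As a worked example (illustrative values): on a host where lscpu reports "NUMA node0 CPU(s): 0-23", the awk snippet below takes the fourth field ("0-23"), keeps the first comma-separated range, and rewrites its leading 0 to 4, yielding "4-23"; the benchmark script is then pinned with taskset -c 4-23.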
--- .github/workflows/benchmarks-reusable.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index 6c00fbb04d..cd2edf902f 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -179,8 +179,18 @@ jobs: - name: Run benchmarks working-directory: ${{ github.workspace }}/ur-repo/ id: benchmarks - run: > - numactl -N 0 ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py + run: | + # Compute the core range for the first NUMA node, skipping the first 4 cores. + # This is to avoid the first cores that the kernel is likely to schedule more work on. + CORES=$(lscpu | awk ' + /NUMA node0 CPU|On-line CPU/ {line=$0} + END { + split(line, a, " ") + split(a[4], b, ",") + sub(/^0/, "4", b[1]) + print b[1] + }') + taskset -c $CORES ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py ~/bench_workdir --sycl ${{ github.workspace }}/sycl_build --ur ${{ github.workspace }}/ur_install From cdca50548253ca8e4aca794a9ab0fa798f0a0b8e Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Fri, 6 Dec 2024 11:14:42 +0000 Subject: [PATCH 124/148] Bump version to v0.12.0 --- CMakeLists.txt | 2 +- include/ur_api.h | 5 +++-- include/ur_api_funcs.def | 2 +- include/ur_ddi.h | 2 +- include/ur_print.hpp | 2 +- scripts/Doxyfile | 2 +- scripts/core/platform.yml | 3 +++ scripts/parse_specs.py | 6 ++++-- source/ur_api.cpp | 2 +- test/tools/urtrace/mock_hello_json.match | 2 +- 10 files changed, 17 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3330ce31b..311c9c487a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) -project(unified-runtime VERSION 0.11.0) +project(unified-runtime VERSION 0.12.0) # Check if unified runtime is built as a standalone project. 
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR UR_STANDALONE_BUILD) diff --git a/include/ur_api.h b/include/ur_api.h index 36e6c29e68..4c97141597 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.h - * @version v0.11-r0 + * @version v0.12-r0 * */ #ifndef UR_API_H_INCLUDED @@ -1156,7 +1156,8 @@ typedef enum ur_api_version_t { UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9 UR_API_VERSION_0_10 = UR_MAKE_VERSION(0, 10), ///< version 0.10 UR_API_VERSION_0_11 = UR_MAKE_VERSION(0, 11), ///< version 0.11 - UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 11), ///< latest known version + UR_API_VERSION_0_12 = UR_MAKE_VERSION(0, 12), ///< version 0.12 + UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 12), ///< latest known version /// @cond UR_API_VERSION_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def index 96d8e0caf4..b841fd6d7b 100644 --- a/include/ur_api_funcs.def +++ b/include/ur_api_funcs.def @@ -8,7 +8,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api_funcs.def - * @version v0.11-r0 + * @version v0.12-r0 * */ diff --git a/include/ur_ddi.h b/include/ur_ddi.h index eeb323fc58..0f8dfabc67 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_ddi.h - * @version v0.11-r0 + * @version v0.12-r0 * */ #ifndef UR_DDI_H_INCLUDED diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5255a20f78..fec36d090a 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_print.hpp - * @version v0.11-r0 + * @version v0.12-r0 * */ #ifndef UR_PRINT_HPP diff --git a/scripts/Doxyfile b/scripts/Doxyfile index 0da9dfb918..cc9aee53c2 100644 --- a/scripts/Doxyfile +++ b/scripts/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Intel One API Unified Runtime API" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = v0.11 +PROJECT_NUMBER = v0.12 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml index 997f4918ee..2a2107b711 100644 --- a/scripts/core/platform.yml +++ b/scripts/core/platform.yml @@ -148,6 +148,9 @@ etors: - name: "0_11" value: "$X_MAKE_VERSION( 0, 11 )" desc: "version 0.11" + - name: "0_12" + value: "$X_MAKE_VERSION( 0, 12 )" + desc: "version 0.12" --- #-------------------------------------------------------------------------- type: function desc: "Returns the API version supported by the specified platform" diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py index fe5cbe2027..42db292c68 100644 --- a/scripts/parse_specs.py +++ b/scripts/parse_specs.py @@ -21,8 +21,10 @@ from version import Version -default_version = Version("0.11") -all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10", "0.11"]] +default_version = Version("0.12") +all_versions = [ + Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10", "0.11", "0.12"] +] """ preprocess object diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 444170c71d..bba4c2a741 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.cpp - * @version v0.11-r0 + * @version v0.12-r0 * */ #include "ur_api.h" diff --git a/test/tools/urtrace/mock_hello_json.match b/test/tools/urtrace/mock_hello_json.match index 46bf1ddd30..eb6d61a854 100644 --- a/test/tools/urtrace/mock_hello_json.match +++ b/test/tools/urtrace/mock_hello_json.match @@ -5,7 +5,7 @@ Platform initialized. { "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urAdapterGet", "args": "(.NumEntries = 1, .phAdapters = {{.*}} {{{.*}}}, .pNumAdapters = nullptr)" }, { "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urPlatformGet", "args": "(.phAdapters = {{.*}} {{{.*}}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = nullptr, .pNumPlatforms = {{.*}} (1))" }, { "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urPlatformGet", "args": "(.phAdapters = {{.*}} {{{.*}}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}} {{{.*}}}, .pNumPlatforms = nullptr)" }, -{ "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urPlatformGetApiVersion", "args": "(.hPlatform = {{.*}}, .pVersion = {{.*}} (0.11))" }, +{ "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urPlatformGetApiVersion", "args": "(.hPlatform = {{.*}}, .pVersion = {{.*}} (0.12))" }, API version: @PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@ { "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urDeviceGet", "args": "(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 0, .phDevices = nullptr, .pNumDevices = {{.*}} (1))" }, { "cat": "UR", "ph": "X", "pid": {{.*}}, "tid": {{.*}}, "ts": {{.*}}, "dur": {{.*}}, "name": "urDeviceGet", "args": "(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 1, .phDevices = {{.*}} {{{.*}}}, .pNumDevices = nullptr)" }, From 569f4f2bb756feec20be948ae5832d594a17e8f6 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Fri, 13 Dec 2024 13:26:09 +0100 Subject: [PATCH 125/148] [benchmarks] fix umf suite after recent 
changes --- .github/workflows/benchmarks-reusable.yml | 12 ++++++++---- scripts/benchmarks/benches/umf.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index cd2edf902f..46bdecb4ca 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -176,9 +176,7 @@ jobs: - name: Build UMF run: cmake --build ${{github.workspace}}/umf_build -j $(nproc) - - name: Run benchmarks - working-directory: ${{ github.workspace }}/ur-repo/ - id: benchmarks + - name: Compute core range run: | # Compute the core range for the first NUMA node, skipping the first 4 cores. # This is to avoid the first cores that the kernel is likely to schedule more work on. @@ -190,7 +188,13 @@ jobs: sub(/^0/, "4", b[1]) print b[1] }') - taskset -c $CORES ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py + echo "CORES=$CORES" >> $GITHUB_ENV + + - name: Run benchmarks + working-directory: ${{ github.workspace }}/ur-repo/ + id: benchmarks + run: > + taskset -c ${{ env.CORES }} ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py ~/bench_workdir --sycl ${{ github.workspace }}/sycl_build --ur ${{ github.workspace }}/ur_install diff --git a/scripts/benchmarks/benches/umf.py b/scripts/benchmarks/benches/umf.py index 738d637cb9..14137ca437 100644 --- a/scripts/benchmarks/benches/umf.py +++ b/scripts/benchmarks/benches/umf.py @@ -43,6 +43,8 @@ def benchmarks(self) -> list[Benchmark]: class ComputeUMFBenchmark(Benchmark): def __init__(self, bench, name): + super().__init__(bench.directory, bench) + self.bench = bench self.bench_name = name self.oneapi = get_oneapi() @@ -55,8 +57,6 @@ def __init__(self, bench, name): self.col_statistics_time = None - super().__init__(bench.directory) - def bin_args(self) -> list[str]: return [] From 4a89e1c69a65acd4f2792743584dfc704086da5e Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 1 Nov 2024 19:05:08 +0100 Subject: [PATCH 126/148] [Spec] fix urKernelSuggestMaxCooperativeGroupCountExp Add extra param: ur_device_handle_t It is necessary to implement this function on L0 for kernels that are build for multiple devices. Right now, the implementation only works when the kernel is created from a native handle. 
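A minimal usage sketch of the new signature (the local work size and shared-memory size are illustrative; hKernel and hDevice stand for valid handles created elsewhere):

    #include <ur_api.h>

    // Query the suggested cooperative work-group count for hKernel when
    // launched on hDevice, using the new explicit device parameter.
    uint32_t suggestGroupCount(ur_kernel_handle_t hKernel,
                               ur_device_handle_t hDevice) {
        const size_t localWorkSize[3] = {64, 1, 1}; // assumed local size
        uint32_t groupCount = 0;
        ur_result_t res = urKernelSuggestMaxCooperativeGroupCountExp(
            hKernel, hDevice, /*workDim=*/3, localWorkSize,
            /*dynamicSharedMemorySize=*/0, &groupCount);
        return (res == UR_RESULT_SUCCESS) ? groupCount : 0;
    }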
Ref: https://github.com/oneapi-src/unified-runtime/issues/2262 --- include/ur_api.h | 3 +++ include/ur_ddi.h | 1 + include/ur_print.hpp | 6 ++++++ scripts/core/exp-cooperative-kernels.yml | 3 +++ source/adapters/cuda/kernel.cpp | 7 +++++-- source/adapters/hip/kernel.cpp | 6 ++++-- source/adapters/level_zero/kernel.cpp | 9 ++++++--- source/adapters/level_zero/ur_interface_loader.hpp | 5 +++-- source/adapters/level_zero/v2/api.cpp | 5 +++-- source/adapters/mock/ur_mockddi.cpp | 7 ++++++- source/adapters/opencl/kernel.cpp | 1 + source/loader/layers/tracing/ur_trcddi.cpp | 9 +++++++-- source/loader/layers/validation/ur_valddi.cpp | 12 +++++++++++- source/loader/ur_ldrddi.cpp | 6 +++++- source/loader/ur_libapi.cpp | 4 +++- source/ur_api.cpp | 2 ++ 16 files changed, 69 insertions(+), 17 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 28569597c4..e504a3aa88 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9536,6 +9536,7 @@ urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` @@ -9543,6 +9544,7 @@ urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the @@ -11083,6 +11085,7 @@ typedef struct ur_kernel_set_specialization_constants_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { ur_kernel_handle_t *phKernel; + ur_device_handle_t *phDevice; uint32_t *pworkDim; const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index eeb323fc58..2384a68ea1 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -651,6 +651,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( ur_kernel_handle_t, + ur_device_handle_t, uint32_t, const size_t *, size_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5255a20f78..08a2fc6ce2 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -13187,6 +13187,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->phKernel)); + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + os << ", "; os << ".workDim = "; diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index ad3ba0ffba..6020ca5f45 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -78,6 +78,9 @@ params: - type: $x_kernel_handle_t name: hKernel desc: "[in] handle of the kernel object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" diff --git 
a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 46c4907d4b..340e5ff634 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -190,10 +190,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + std::ignore = hDevice; + size_t localWorkSize = pLocalWorkSize[0]; localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1); localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index 1ba50c4360..a5aefb1293 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -169,9 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { std::ignore = hKernel; + std::ignore = hDevice; std::ignore = workDim; std::ignore = pLocalWorkSize; std::ignore = dynamicSharedMemorySize; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index b15b4ce147..db9337289f 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -1054,8 +1054,9 @@ ur_result_t urKernelGetNativeHandle( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { (void)dynamicSharedMemorySize; std::shared_lock Guard(hKernel->Mutex); @@ -1066,8 +1067,10 @@ ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2])); uint32_t TotalGroupCount = 0; + ze_kernel_handle_t ZeKernel; + UR_CALL(getZeKernel(hDevice->ZeDevice, hKernel, &ZeKernel)); ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, - (hKernel->ZeKernel, &TotalGroupCount)); + (ZeKernel, &TotalGroupCount)); *pGroupCountRet = TotalGroupCount; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 5bd7c904f1..f2fd6a46d4 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -691,8 +691,9 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); + 
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index c04d4cf6ca..5fa6478118 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -560,8 +560,9 @@ ur_result_t urCommandBufferCommandGetInfoExp( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index b60be1d561..b27c4efaa1 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10057,6 +10057,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -10072,7 +10073,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_result_t result = UR_RESULT_SUCCESS; ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; auto beforeCallback = reinterpret_cast( diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index df160b65eb..fb2c735adc 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -390,6 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] uint32_t workDim, [[maybe_unused]] const size_t *pLocalWorkSize, [[maybe_unused]] size_t dynamicSharedMemorySize, diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 55f8d00bea..3e8043a258 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8633,6 +8633,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8654,7 
+8655,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, @@ -8664,7 +8669,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); getContext()->notify_end( diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 6e48f79edc..d13df673cd 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9656,6 +9656,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -9681,6 +9682,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + if (NULL == pLocalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9695,8 +9700,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( getContext()->refCountContext->logInvalidReference(hKernel); } + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index c74b9d6caf..480678d598 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8844,6 +8844,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8871,9 +8872,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + // forward to device-platform result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, 
hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index e257366a7f..fc24d9347b 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8935,12 +8935,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8961,7 +8963,7 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 793045bcb4..eb3f20c77b 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7578,12 +7578,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items From ecb2b08db9cf73377575200baa7c7591c678fb27 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Mon, 18 Nov 2024 23:24:38 +0000 Subject: [PATCH 127/148] [L0 v2] enable functions related to virtual/physical memory management --- source/adapters/level_zero/CMakeLists.txt | 3 + source/adapters/level_zero/physical_mem.cpp | 14 ++-- source/adapters/level_zero/v2/api.cpp | 75 ------------------- source/adapters/level_zero/virtual_mem.cpp | 25 ++++--- .../enqueue_adapter_level_zero_v2.match | 1 - ...virtual_memory_adapter_level_zero_v2.match | 38 +--------- 6 files changed, 28 insertions(+), 128 deletions(-) diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index cb7e0281af..39031a700d 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -127,6 +127,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp ${CMAKE_CURRENT_SOURCE_DIR}/program.hpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp @@ -135,10 +136,12 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp # v2-only sources diff --git a/source/adapters/level_zero/physical_mem.cpp b/source/adapters/level_zero/physical_mem.cpp index e28b876905..79373aceee 100644 --- a/source/adapters/level_zero/physical_mem.cpp +++ b/source/adapters/level_zero/physical_mem.cpp @@ -10,9 +10,13 @@ #include "physical_mem.hpp" #include "common.hpp" -#include "context.hpp" #include "device.hpp" -#include "ur_level_zero.hpp" + +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 +#include "v2/context.hpp" +#else +#include "context.hpp" +#endif namespace ur::level_zero { @@ -25,7 +29,7 @@ ur_result_t urPhysicalMemCreate( PhysicalMemDesc.size = size; ze_physical_mem_handle_t ZePhysicalMem; - ZE2UR_CALL(zePhysicalMemCreate, (hContext->ZeContext, hDevice->ZeDevice, + ZE2UR_CALL(zePhysicalMemCreate, (hContext->getZeHandle(), hDevice->ZeDevice, &PhysicalMemDesc, &ZePhysicalMem)); try { *phPhysicalMem = new ur_physical_mem_handle_t_(ZePhysicalMem, hContext); @@ -46,8 +50,8 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { if (!hPhysicalMem->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; - ZE2UR_CALL(zePhysicalMemDestroy, - (hPhysicalMem->Context->ZeContext, hPhysicalMem->ZePhysicalMem)); + ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(), + hPhysicalMem->ZePhysicalMem)); delete hPhysicalMem; return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index c04d4cf6ca..f774f9e263 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -88,81 +88,6 @@ ur_result_t urSamplerCreateWithNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urVirtualMemGranularityGetInfo( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_virtual_mem_granularity_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, - const void *pStart, size_t size, - void **ppStart) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urVirtualMemFree(ur_context_handle_t hContext, const void *pStart, - size_t size) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, - size_t size, ur_physical_mem_handle_t hPhysicalMem, - size_t offset, - ur_virtual_mem_access_flags_t flags) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urVirtualMemUnmap(ur_context_handle_t hContext, const void *pStart, - size_t size) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urVirtualMemSetAccess(ur_context_handle_t hContext, - const void *pStart, size_t size, - ur_virtual_mem_access_flags_t flags) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t 
urVirtualMemGetInfo(ur_context_handle_t hContext, - const void *pStart, size_t size, - ur_virtual_mem_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urPhysicalMemCreate(ur_context_handle_t hContext, - ur_device_handle_t hDevice, size_t size, - const ur_physical_mem_properties_t *pProperties, - ur_physical_mem_handle_t *phPhysicalMem) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemGetInfo( - ur_physical_mem_handle_t hPhysicalMem, ur_physical_mem_info_t propName, - size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_sampler_properties_t *pProperties, diff --git a/source/adapters/level_zero/virtual_mem.cpp b/source/adapters/level_zero/virtual_mem.cpp index e89899ded7..092edc8ed1 100644 --- a/source/adapters/level_zero/virtual_mem.cpp +++ b/source/adapters/level_zero/virtual_mem.cpp @@ -9,11 +9,15 @@ //===----------------------------------------------------------------------===// #include "common.hpp" -#include "context.hpp" #include "device.hpp" #include "logger/ur_logger.hpp" #include "physical_mem.hpp" -#include "ur_level_zero.hpp" + +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 +#include "v2/context.hpp" +#else +#include "context.hpp" +#endif namespace ur::level_zero { @@ -30,7 +34,7 @@ ur_result_t urVirtualMemGranularityGetInfo( // aligned size. 
size_t PageSize; ZE2UR_CALL(zeVirtualMemQueryPageSize, - (hContext->ZeContext, hDevice->ZeDevice, 1, &PageSize)); + (hContext->getZeHandle(), hDevice->ZeDevice, 1, &PageSize)); return ReturnValue(PageSize); } default: @@ -44,14 +48,15 @@ ur_result_t urVirtualMemGranularityGetInfo( ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, size_t size, void **ppStart) { - ZE2UR_CALL(zeVirtualMemReserve, (hContext->ZeContext, pStart, size, ppStart)); + ZE2UR_CALL(zeVirtualMemReserve, + (hContext->getZeHandle(), pStart, size, ppStart)); return UR_RESULT_SUCCESS; } ur_result_t urVirtualMemFree(ur_context_handle_t hContext, const void *pStart, size_t size) { - ZE2UR_CALL(zeVirtualMemFree, (hContext->ZeContext, pStart, size)); + ZE2UR_CALL(zeVirtualMemFree, (hContext->getZeHandle(), pStart, size)); return UR_RESULT_SUCCESS; } @@ -66,7 +71,7 @@ ur_result_t urVirtualMemSetAccess(ur_context_handle_t hContext, AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY; ZE2UR_CALL(zeVirtualMemSetAccessAttribute, - (hContext->ZeContext, pStart, size, AccessAttr)); + (hContext->getZeHandle(), pStart, size, AccessAttr)); return UR_RESULT_SUCCESS; } @@ -82,15 +87,15 @@ ur_result_t urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY; ZE2UR_CALL(zeVirtualMemMap, - (hContext->ZeContext, pStart, size, hPhysicalMem->ZePhysicalMem, - offset, AccessAttr)); + (hContext->getZeHandle(), pStart, size, + hPhysicalMem->ZePhysicalMem, offset, AccessAttr)); return UR_RESULT_SUCCESS; } ur_result_t urVirtualMemUnmap(ur_context_handle_t hContext, const void *pStart, size_t size) { - ZE2UR_CALL(zeVirtualMemUnmap, (hContext->ZeContext, pStart, size)); + ZE2UR_CALL(zeVirtualMemUnmap, (hContext->getZeHandle(), pStart, size)); return UR_RESULT_SUCCESS; } @@ -106,7 +111,7 @@ ur_result_t urVirtualMemGetInfo(ur_context_handle_t hContext, size_t QuerySize; ze_memory_access_attribute_t Access; ZE2UR_CALL(zeVirtualMemGetAccessAttribute, - (hContext->ZeContext, pStart, size, &Access, &QuerySize)); + (hContext->getZeHandle(), pStart, size, &Access, &QuerySize)); ur_virtual_mem_access_flags_t RetFlags = 0; if (Access & ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE) RetFlags |= UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE; diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match index 44895d10fa..7b1739df4e 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match @@ -1,6 +1,5 @@ urEnqueueKernelLaunchTest.InvalidKernelArgs/* urEnqueueKernelLaunchKernelWgSizeTest.Success/* -urEnqueueKernelLaunchWithVirtualMemory.Success/* {{OPT}}urEnqueueKernelLaunchIncrementTest.Success/*__UseEventsEnabled {{OPT}}urEnqueueKernelLaunchIncrementTest.Success/*__UseEventsDisabled {{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/UseEventsNoQueuePerThread diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match index 531773a246..633aa41f90 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match @@ -1,42 +1,6 @@ -urPhysicalMemCreateWithSizeParamTest.Success/* -urPhysicalMemCreateWithFlagsParamTest.Success/* -urPhysicalMemCreateTest.InvalidNullHandleContext/* 
-urPhysicalMemCreateTest.InvalidNullHandleDevice/* -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/* -urPhysicalMemCreateTest.InvalidEnumeration/* +{{OPT}}urPhysicalMemCreateWithSizeParamTest.Success/* urPhysicalMemCreateWithSizeParamTest.InvalidSize/* -urPhysicalMemReleaseTest.Success/* -urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/* -urPhysicalMemRetainTest.Success/* -urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/* urPhysicalMemGetInfoTest.Context/* urPhysicalMemGetInfoTest.Device/* urPhysicalMemGetInfoTest.Size/* urPhysicalMemGetInfoTest.Properties/* -urPhysicalMemGetInfoTest.ReferenceCount/* -urVirtualMemFreeTest.Success/* -urVirtualMemFreeTest.InvalidNullHandleContext/* -urVirtualMemFreeTest.InvalidNullPointerStart/* -urVirtualMemGetInfoTestWithParam.Success/*__UR_VIRTUAL_MEM_INFO_ACCESS_MODE -urVirtualMemGetInfoTest.InvalidNullHandleContext/* -urVirtualMemGetInfoTest.InvalidNullPointerStart/* -urVirtualMemGetInfoTest.InvalidEnumerationInfo/* -urVirtualMemGranularityGetInfoTest.Success/*__UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM -urVirtualMemGranularityGetInfoTest.Success/*__UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED -urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/* -urVirtualMemMapWithFlagsTest.Success/* -urVirtualMemMapTest.InvalidNullHandleContext/* -urVirtualMemMapTest.InvalidNullHandlePhysicalMem/* -urVirtualMemMapTest.InvalidNullPointerStart/* -urVirtualMemMapTest.InvalidEnumerationFlags/* -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/* -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/* -urVirtualMemReserveTest.InvalidNullHandleContext/* -urVirtualMemReserveTest.InvalidNullPointer/* -urVirtualMemSetAccessWithFlagsTest.Success/* -urVirtualMemSetAccessTest.InvalidNullHandleContext/* -urVirtualMemSetAccessTest.InvalidNullPointerStart/* -urVirtualMemSetAccessTest.InvalidEnumeration/* -urVirtualMemUnmapTest.Success/* -urVirtualMemUnmapTest.InvalidNullHandleContext/* -urVirtualMemUnmapTest.InvalidNullPointerStart/* From be27d8f06e559890ca47c131b3346e0e2a551057 Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Fri, 13 Dec 2024 11:28:27 -0800 Subject: [PATCH 128/148] [L0] Fix external semaphore import function calls to match the header - Fix the function pointers for the external semaphore import function calls to match the updated header. Signed-off-by: Neil R. 
Spruit --- source/adapters/level_zero/image.cpp | 6 +++--- source/adapters/level_zero/platform.hpp | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 8437fcff95..09bdc16a64 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -1237,7 +1237,7 @@ ur_result_t urBindlessImagesImportExternalSemaphoreExp( } ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt.zexImportExternalSemaphoreExp, - (hDevice->ZeDevice, &ExtSemaphoreHandle, &SemDesc)); + (hDevice->ZeDevice, &SemDesc, &ExtSemaphoreHandle)); *phExternalSemaphoreHandle = (ur_exp_external_semaphore_handle_t)ExtSemaphoreHandle; @@ -1310,7 +1310,7 @@ ur_result_t urBindlessImagesWaitExternalSemaphoreExp( reinterpret_cast(hSemaphore); ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt .zexCommandListAppendWaitExternalSemaphoresExp, - (ZeCommandList, &hExtSemaphore, &WaitParams, 1, ZeEvent, + (ZeCommandList, 1, &hExtSemaphore, &WaitParams, ZeEvent, WaitList.Length, WaitList.ZeEventList)); return UR_RESULT_SUCCESS; @@ -1373,7 +1373,7 @@ ur_result_t urBindlessImagesSignalExternalSemaphoreExp( ZE2UR_CALL(UrPlatform->ZeExternalSemaphoreExt .zexCommandListAppendSignalExternalSemaphoresExp, - (ZeCommandList, &hExtSemaphore, &SignalParams, 1, ZeEvent, + (ZeCommandList, 1, &hExtSemaphore, &SignalParams, ZeEvent, WaitList.Length, WaitList.ZeEventList)); return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index 4b613fb1e5..748460158c 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -119,17 +119,17 @@ struct ur_platform_handle_t_ : public _ur_platform { struct ZeExternalSemaphoreExtension { bool Supported = false; ze_result_t (*zexImportExternalSemaphoreExp)( - ze_device_handle_t, ze_intel_external_semaphore_exp_handle_t *, - const ze_intel_external_semaphore_exp_desc_t *); + ze_device_handle_t, const ze_intel_external_semaphore_exp_desc_t *, + ze_intel_external_semaphore_exp_handle_t *); ze_result_t (*zexCommandListAppendWaitExternalSemaphoresExp)( - ze_command_list_handle_t, + ze_command_list_handle_t, unsigned int, const ze_intel_external_semaphore_exp_handle_t *, - const ze_intel_external_semaphore_wait_exp_params_t *, unsigned int, + const ze_intel_external_semaphore_wait_exp_params_t *, ze_event_handle_t, uint32_t, ze_event_handle_t *); ze_result_t (*zexCommandListAppendSignalExternalSemaphoresExp)( - ze_command_list_handle_t, + ze_command_list_handle_t, size_t, const ze_intel_external_semaphore_exp_handle_t *, - const ze_intel_external_semaphore_signal_exp_params_t *, size_t, + const ze_intel_external_semaphore_signal_exp_params_t *, ze_event_handle_t, uint32_t, ze_event_handle_t *); ze_result_t (*zexDeviceReleaseExternalSemaphoreExp)( ze_intel_external_semaphore_exp_handle_t); From 64ae812e9dc0c0369094ed7874c58bf85e5301ca Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 11 Dec 2024 10:46:28 -0800 Subject: [PATCH 129/148] [L0] Update L0 Init checking to print details in error log - Changed the L0 Init failures to go to the error log instead of the debug log, and always return an error to the error log if no init is possible. - Enabled calling zeParseError to generate the string of L0 error codes in all UR L0 code. Signed-off-by: Neil R.
Spruit --- source/adapters/level_zero/adapter.cpp | 13 +++++++++---- source/adapters/level_zero/common.cpp | 2 +- source/adapters/level_zero/common.hpp | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 7dff6bcf14..68aa852595 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "adapter.hpp" +#include "common.hpp" #include "ur_level_zero.hpp" #include @@ -162,7 +163,7 @@ ur_result_t initPlatforms(PlatformVec &platforms, ZE2UR_CALL(zeDriverGet, (&ZeDriverGetCount, ZeDriverGetHandles.data())); } if (ZeDriverGetCount == 0 && GlobalAdapter->ZeInitDriversCount == 0) { - logger::debug("\nNo Valid L0 Drivers found.\n"); + logger::error("\nNo Valid L0 Drivers found.\n"); return UR_RESULT_SUCCESS; } @@ -376,7 +377,9 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() static_cast(L0InitFlags)); GlobalAdapter->ZeInitResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); if (GlobalAdapter->ZeInitResult != ZE_RESULT_SUCCESS) { - logger::debug("\nzeInit failed with {}\n", GlobalAdapter->ZeInitResult); + const char *ErrorString = "Unknown"; + zeParseError(GlobalAdapter->ZeInitResult, ErrorString); + logger::error("\nzeInit failed with {}\n", ErrorString); } bool useInitDrivers = false; @@ -422,8 +425,9 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() if (GlobalAdapter->ZeInitDriversResult == ZE_RESULT_SUCCESS) { GlobalAdapter->InitDriversSupported = true; } else { - logger::debug("\nzeInitDrivers failed with {}\n", - GlobalAdapter->ZeInitDriversResult); + const char *ErrorString = "Unknown"; + zeParseError(GlobalAdapter->ZeInitDriversResult, ErrorString); + logger::error("\nzeInitDrivers failed with {}\n", ErrorString); } } } @@ -441,6 +445,7 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. if (*GlobalAdapter->ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + logger::error("Level Zero Uninitialized\n"); result = std::move(platforms); return; } diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 3b3f59e055..e13afc179f 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -88,7 +88,7 @@ ZeUSMImportExtension ZeUSMImport; std::map *ZeCallCount = nullptr; -inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { +void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { #define ZE_ERRCASE(ERR) \ case ERR: \ diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index 8a93993752..09d144df82 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -340,6 +340,9 @@ bool setEnvVar(const char *name, const char *value); // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); +// Parse Level Zero error code and return the error string. 
+void zeParseError(ze_result_t ZeError, const char *&ErrorString); + // Trace a call to Level-Zero RT #define ZE2UR_CALL(ZeName, ZeArgs) \ { \ From d05b5d5b5bf887c0eae98301e965ab55ca158531 Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Mon, 16 Dec 2024 09:55:18 +0100 Subject: [PATCH 130/148] init LaunchInfo --- source/loader/CMakeLists.txt | 1 + source/loader/layers/sanitizer/asan/asan_ddi.cpp | 1 + source/loader/layers/sanitizer/asan/asan_libdevice.hpp | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index d8f6056ae9..a10e99f422 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -136,6 +136,7 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_libdevice.hpp diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 380b51a0da..9378544d65 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -464,6 +464,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); + UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); diff --git a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp index a2d5ecd6be..4c6aaaeac8 100644 --- a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp @@ -66,7 +66,7 @@ struct AsanRuntimeData { uint32_t Debug = 0; int ReportFlag = 0; - AsanErrorReport Report[ASAN_MAX_NUM_REPORTS]; + AsanErrorReport Report[ASAN_MAX_NUM_REPORTS] = {}; }; constexpr unsigned ASAN_SHADOW_SCALE = 4; From 390d0b556d67cb31abb87854016df4ca86d18ea8 Mon Sep 17 00:00:00 2001 From: "Zhao, Yang2" Date: Mon, 16 Dec 2024 09:59:01 +0100 Subject: [PATCH 131/148] remove unused var --- source/loader/layers/sanitizer/asan/asan_ddi.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 9378544d65..bf4dff157a 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1420,7 +1420,6 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( getContext()->logger.debug("==== urKernelSetArgValue"); std::shared_ptr MemBuffer; - std::shared_ptr KernelInfo; if (argSize == sizeof(ur_mem_handle_t) && (MemBuffer = getAsanInterceptor()->getMemBuffer( *ur_cast(pArgValue)))) { @@ -1453,7 +1452,6 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( getContext()->logger.debug("==== urKernelSetArgMemObj"); std::shared_ptr MemBuffer; - std::shared_ptr KernelInfo; if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) { auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KernelInfo->Mutex); From 1a5bb688a7dda429072b53b2f8f785f1d7936320 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Fri, 13 Dec 2024 14:41:08 +0100 Subject: [PATCH 132/148] [CI] Gather platform's info in all HW workflows --- .github/workflows/benchmarks-reusable.yml | 5 +++++ .github/workflows/e2e_core.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index 46bdecb4ca..83c05f896c 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -236,3 +236,8 @@ jobs: with: path: ur-repo/benchmark_results.html key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }} + + - name: Get information about platform + if: ${{ always() }} + working-directory: ${{ github.workspace }}/ur-repo/ + run: .github/scripts/get_system_info.sh diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index f12913c648..c98ec21288 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -192,6 +192,11 @@ jobs: id: tests run: ninja -C build-e2e check-sycl-e2e || echo "e2e tests have failed. Ignoring failure." + - name: Get information about platform + if: ${{ always() }} + working-directory: ${{github.workspace}}/ur-repo + run: .github/scripts/get_system_info.sh + # FIXME: Requires pull-request: write permissions but this is only granted # on pull requests from forks if using pull_request_target workflow # trigger but not the pull_request trigger.. From faabeacbe249def8f357410fcee98923f717de4d Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Mon, 16 Dec 2024 12:57:48 +0100 Subject: [PATCH 133/148] [benchmarks] use template for html output Using a simple f-string was starting to get annoying with the increased use of javascript. --- .../benchmark_results.html.template | 192 ++++++++++++++++ scripts/benchmarks/output_html.py | 207 +----------------- 2 files changed, 204 insertions(+), 195 deletions(-) create mode 100644 scripts/benchmarks/benchmark_results.html.template diff --git a/scripts/benchmarks/benchmark_results.html.template b/scripts/benchmarks/benchmark_results.html.template new file mode 100644 index 0000000000..1deeedad66 --- /dev/null +++ b/scripts/benchmarks/benchmark_results.html.template @@ -0,0 +1,192 @@ + + + + + + Benchmark Results + + + + +
+ [The 192-line HTML template body was garbled in extraction; its markup is lost. The recoverable structure: a page titled "Benchmark Results" with suite filter checkboxes inserted via ${suite_checkboxes_html}, a "Historical Results" section populated by ${timeseries_charts_html}, and a "Comparisons" section populated by ${bar_charts_html}.]
-

Benchmark Results

-
- -
-
- {suite_checkboxes_html} -
-
- Historical Results -
- {timeseries_charts_html} -
-
-
- Comparisons -
- {bar_charts_html} -
-
-
- - - """ - return html_template + with open('benchmark_results.html.template', 'r') as file: + html_template = file.read() + + template = Template(html_template) + data = { + 'suite_checkboxes_html': suite_checkboxes_html, + 'timeseries_charts_html': timeseries_charts_html, + 'bar_charts_html': bar_charts_html, + } + + return template.substitute(data) From 1bfff1799933f23e31d21efbb33a8f2d46dbfa3c Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Mon, 16 Dec 2024 12:58:26 +0100 Subject: [PATCH 134/148] [benchmarks] use "unknown" suite for old results This fixes the empty "" checkbox on the results html. --- scripts/benchmarks/benches/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index aa1459cbb3..dfa270689f 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -27,7 +27,7 @@ class Result: lower_is_better: bool = True git_hash: str = "" date: Optional[datetime] = None - suite: str = "" + suite: str = "Unknown" @dataclass_json @dataclass From c0a372bd53ce3da18d736e159c03c906057f2057 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 9 Feb 2024 15:46:00 +0000 Subject: [PATCH 135/148] Explicitly define which info queries are optional. This is now reflected in the spec and in the CTS tests. Also includes a number of minor fixes for adapter implementations of related info queries. --- include/ur_api.h | 90 +++++++------- scripts/YaML.md | 1 + scripts/core/PROG.rst | 18 +++ scripts/core/context.yml | 8 +- scripts/core/device.yml | 34 ++--- scripts/core/kernel.yml | 12 +- scripts/core/program.yml | 4 +- scripts/core/queue.yml | 2 +- scripts/core/usm.yml | 2 +- scripts/generate_code.py | 21 ++++ scripts/json2src.py | 4 + scripts/templates/helper.py | 35 +++++- scripts/templates/optional_queries.h.mako | 45 +++++++ scripts/templates/tools-info.hpp.mako | 20 +-- source/adapters/cuda/device.cpp | 1 + source/adapters/cuda/program.cpp | 8 +- source/adapters/hip/device.cpp | 11 +- source/adapters/hip/program.cpp | 8 +- source/adapters/level_zero/device.cpp | 7 +- source/adapters/level_zero/event.cpp | 2 +- source/adapters/level_zero/queue.cpp | 2 +- source/adapters/native_cpu/device.cpp | 7 +- source/adapters/opencl/device.cpp | 18 +-- source/adapters/opencl/usm.cpp | 2 +- test/conformance/adapter/urAdapterGetInfo.cpp | 3 +- test/conformance/context/urContextGetInfo.cpp | 4 +- .../device/device_adapter_cuda.match | 1 + .../device/device_adapter_level_zero.match | 1 + .../device/device_adapter_native_cpu.match | 16 +-- test/conformance/device/urDeviceGetInfo.cpp | 60 +++++---- .../enqueue/enqueue_adapter_level_zero.match | 10 -- .../enqueue_adapter_level_zero_v2.match | 10 -- test/conformance/event/urEventGetInfo.cpp | 3 +- .../event/urEventGetProfilingInfo.cpp | 5 +- .../kernel/kernel_adapter_opencl.match | 1 - .../kernel/urKernelGetGroupInfo.cpp | 18 ++- test/conformance/kernel/urKernelGetInfo.cpp | 5 +- .../kernel/urKernelGetSubGroupInfo.cpp | 6 +- test/conformance/memory/urMemGetInfo.cpp | 3 +- test/conformance/memory/urMemImageGetInfo.cpp | 3 +- .../platform/urPlatformGetInfo.cpp | 3 +- .../program/program_adapter_level_zero.match | 1 + .../program_adapter_level_zero_v2.match | 1 + .../program/urProgramGetBuildInfo.cpp | 17 +-- test/conformance/program/urProgramGetInfo.cpp | 25 ++-- .../queue/queue_adapter_native_cpu.match | 1 + test/conformance/queue/urQueueGetInfo.cpp | 23 ++-- test/conformance/sampler/urSamplerGetInfo.cpp | 3 +- 
test/conformance/testing/include/uur/checks.h | 16 +++ .../testing/include/uur/fixtures.h | 6 + .../testing/include/uur/optional_queries.h | 117 ++++++++++++++++++ test/conformance/usm/urUSMGetMemAllocInfo.cpp | 5 +- test/conformance/usm/urUSMPoolGetInfo.cpp | 3 +- .../usm/usm_adapter_native_cpu.match | 1 - test/conformance/usm/usm_adapter_opencl.match | 1 - .../virtual_memory/urVirtualMemGetInfo.cpp | 6 +- .../urVirtualMemGranularityGetInfo.cpp | 6 +- 57 files changed, 517 insertions(+), 229 deletions(-) create mode 100644 scripts/templates/optional_queries.h.mako delete mode 100644 test/conformance/kernel/kernel_adapter_opencl.match create mode 100644 test/conformance/testing/include/uur/optional_queries.h delete mode 100644 test/conformance/usm/usm_adapter_opencl.match diff --git a/include/ur_api.h b/include/ur_api.h index 3662f3bbc5..1d2d43f489 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1493,7 +1493,7 @@ urDeviceGetSelected( typedef enum ur_device_info_t { UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device - UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device + UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t][optional-query] Id of the device UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes @@ -1520,7 +1520,7 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz - UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz + UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t][optional-query] memory clock frequency in MHz UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported @@ -1544,7 +1544,8 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes - UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes + UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t][optional-query] size of global memory which is free in + ///< bytes UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type @@ -1601,15 +1602,16 @@ typedef enum ur_device_info_t { ///< shared memory access UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide ///< shared memory access - UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID - UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address - UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] 
return Intel GPU EU count - UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width - UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices - UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice - UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice - UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU - UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s + UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]][optional-query] return device UUID + UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]][optional-query] return device PCI address + UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t][optional-query] return Intel GPU EU count + UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t][optional-query] return Intel GPU EU SIMD width + UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t][optional-query] return Intel GPU number of slices + UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t][optional-query] return Intel GPU EU count per subslice + UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t][optional-query] return Intel GPU number of subslices per + ///< slice + UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t][optional-query] return Intel GPU number of threads per EU + UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t][optional-query] return max memory bandwidth in Mb/s UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program ///< build @@ -1628,23 +1630,24 @@ typedef enum ur_device_info_t { ///< available for this device. UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry ///< point - UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the - ///< device. + UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t][optional-query] return the width in bits of the memory bus + ///< interface of the device. UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands ///< to read and write pipes from the host. - UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block. - UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version - ///< is implementation-defined, but newer devices should have a higher - ///< version than older devices. + UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t][optional-query] The maximum number of registers available + ///< per block. + UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t][optional-query] The device IP version. The meaning of the + ///< device IP version is implementation-defined, but newer devices should + ///< have a higher version than older devices. 
UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory. UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD. - UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by - ///< this composite device. - UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component - ///< device. + UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]][optional-query] The set of component devices + ///< contained by this composite device. + UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t][optional-query] The composite device containing + ///< this component device. UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT = 118, ///< [::ur_bool_t] return true if the device supports the ///< `EnqueueDeviceGlobalVariableWrite` and ///< `EnqueueDeviceGlobalVariableRead` entry points. @@ -2290,15 +2293,15 @@ typedef enum ur_context_info_t { ///< supported. UR_CONTEXT_INFO_USM_FILL2D_SUPPORT = 4, ///< [::ur_bool_t] to indicate if the ::urEnqueueUSMFill2D entrypoint is ///< supported. - UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 5, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory order capabilities. - UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 6, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory scope capabilities. - UR_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 7, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory fence order capabilities. + UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 5, ///< [::ur_memory_order_capability_flags_t][optional-query] return a + ///< bit-field of atomic memory order capabilities. + UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 6, ///< [::ur_memory_scope_capability_flags_t][optional-query] return a + ///< bit-field of atomic memory scope capabilities. + UR_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 7, ///< [::ur_memory_order_capability_flags_t][optional-query] return a + ///< bit-field of atomic memory fence order capabilities. ///< Zero is returned if the backend does not support context-level fences. - UR_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 8, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory fence scope capabilities. + UR_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 8, ///< [::ur_memory_scope_capability_flags_t][optional-query] return a + ///< bit-field of atomic memory fence scope capabilities. ///< Zero is returned if the backend does not support context-level fences. /// @cond UR_CONTEXT_INFO_FORCE_UINT32 = 0x7fffffff @@ -3395,7 +3398,7 @@ typedef enum ur_usm_alloc_info_t { UR_USM_ALLOC_INFO_BASE_PTR = 1, ///< [void *] Memory allocation base pointer info UR_USM_ALLOC_INFO_SIZE = 2, ///< [size_t] Memory allocation size info UR_USM_ALLOC_INFO_DEVICE = 3, ///< [::ur_device_handle_t] Memory allocation device info - UR_USM_ALLOC_INFO_POOL = 4, ///< [::ur_usm_pool_handle_t] Memory allocation pool info + UR_USM_ALLOC_INFO_POOL = 4, ///< [::ur_usm_pool_handle_t][optional-query] Memory allocation pool info /// @cond UR_USM_ALLOC_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -4585,9 +4588,10 @@ typedef enum ur_program_info_t { UR_PROGRAM_INFO_BINARY_SIZES = 5, ///< [size_t[]] Return program binary sizes for each device. 
UR_PROGRAM_INFO_BINARIES = 6, ///< [unsigned char[]] Return program binaries for all devices for this ///< Program. - UR_PROGRAM_INFO_NUM_KERNELS = 7, ///< [size_t] Number of kernels in Program, return type size_t. - UR_PROGRAM_INFO_KERNEL_NAMES = 8, ///< [char[]] Return a null-terminated, semi-colon separated list of kernel - ///< names in Program. + UR_PROGRAM_INFO_NUM_KERNELS = 7, ///< [size_t][optional-query] Number of kernels in Program, return type + ///< size_t. + UR_PROGRAM_INFO_KERNEL_NAMES = 8, ///< [char[]][optional-query] Return a null-terminated, semi-colon + ///< separated list of kernel names in Program. /// @cond UR_PROGRAM_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -4938,8 +4942,8 @@ typedef enum ur_kernel_info_t { UR_KERNEL_INFO_CONTEXT = 3, ///< [::ur_context_handle_t] Return Context object associated with Kernel. UR_KERNEL_INFO_PROGRAM = 4, ///< [::ur_program_handle_t] Return Program object associated with Kernel. UR_KERNEL_INFO_ATTRIBUTES = 5, ///< [char[]] Return null-terminated kernel attributes string. - UR_KERNEL_INFO_NUM_REGS = 6, ///< [uint32_t] Return the number of registers used by the compiled kernel - ///< (device specific). + UR_KERNEL_INFO_NUM_REGS = 6, ///< [uint32_t][optional-query] Return the number of registers used by the + ///< compiled kernel. /// @cond UR_KERNEL_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -4949,7 +4953,7 @@ typedef enum ur_kernel_info_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Get Kernel Work Group information typedef enum ur_kernel_group_info_t { - UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = 0, ///< [size_t[3]] Return Work Group maximum global size + UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = 0, ///< [size_t[3]][optional-query] Return Work Group maximum global size UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE = 1, ///< [size_t] Return maximum Work Group size UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE = 2, ///< [size_t[3]] Return Work Group size required by the source code, such ///< as __attribute__((required_work_group_size(X,Y,Z)), or (0, 0, 0) if @@ -4958,10 +4962,10 @@ typedef enum ur_kernel_group_info_t { UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each ///< work item in the Kernel - UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6, ///< [size_t[3]] Return the maximum Work Group size guaranteed by the - ///< source code, or (0, 0, 0) if unspecified - UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z) - ///< guaranteed by the source code, or 0 if unspecified + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6, ///< [size_t[3]][optional-query] Return the maximum Work Group size guaranteed + ///< by the source code, or (0, 0, 0) if unspecified + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t][optional-query] Return the maximum linearized Work Group size + ///< (X * Y * Z) guaranteed by the source code, or 0 if unspecified /// @cond UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -5480,8 +5484,8 @@ typedef enum ur_queue_info_t { UR_QUEUE_INFO_SIZE = 5, ///< [uint32_t] The size of the queue on the device. 
Only a valid query ///< if the queue was created with the `ON_DEVICE` queue flag, otherwise ///< `::urQueueGetInfo` will return `::UR_RESULT_ERROR_INVALID_QUEUE`. - UR_QUEUE_INFO_EMPTY = 6, ///< [::ur_bool_t] return true if the queue was empty at the time of the - ///< query + UR_QUEUE_INFO_EMPTY = 6, ///< [::ur_bool_t][optional-query] return true if the queue was empty at + ///< the time of the query. /// @cond UR_QUEUE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/scripts/YaML.md b/scripts/YaML.md index 2ab5dd80b2..3679bdec8e 100644 --- a/scripts/YaML.md +++ b/scripts/YaML.md @@ -336,6 +336,7 @@ plural form *enumerators* is abbreviated to `etors`. - An etor requires the following scalar fields: {`name`, `desc`} + `desc` will be used as the etors's description comment + If the enum has `typed_etors`, `desc` must begin with type identifier: {`"[type]"`} + + `desc` may contain the [optional-query] annotation. This denotes the etor as an info query which is optional for adapters to implement, and may legally result in a non-success error code. + `name` must be a unique ISO-C standard identifier, and be all caps - An etor may take the following optional scalar field: {`value`, `version`} + `value` must be an ISO-C standard identifier diff --git a/scripts/core/PROG.rst b/scripts/core/PROG.rst index ea0445cb60..54123bbc1f 100644 --- a/scripts/core/PROG.rst +++ b/scripts/core/PROG.rst @@ -183,6 +183,24 @@ explicitly created against a context. // Release the context handle ${x}ContextRelease(hContext); +Object Queries +============== + +Queries to get information from API objects follow a common pattern. The entry +points for this are generally of the form: + +.. code-block:: + + ObjectGetInfo(ur_object_handle_t hObject, ur_object_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) + +where ``propName`` selects the information to query out. The object info enum +representing possible queries will generally be found in the enums section of +the relevant object. Some info queries would be difficult or impossible to +support for certain backends, these are denoted with [optional-query] in the +enum description. Using any enum marked optional in this way may result in +${X}_RESULT_ERROR_UNSUPPORTED_ENUMERATION if the adapter doesn't support it. + Programs and Kernels ==================== diff --git a/scripts/core/context.yml b/scripts/core/context.yml index e45c93010d..9b316a21a7 100644 --- a/scripts/core/context.yml +++ b/scripts/core/context.yml @@ -102,16 +102,16 @@ etors: - name: USM_FILL2D_SUPPORT desc: "[$x_bool_t] to indicate if the $xEnqueueUSMFill2D entrypoint is supported." - name: ATOMIC_MEMORY_ORDER_CAPABILITIES - desc: "[$x_memory_order_capability_flags_t] return a bit-field of atomic memory order capabilities." + desc: "[$x_memory_order_capability_flags_t][optional-query] return a bit-field of atomic memory order capabilities." - name: ATOMIC_MEMORY_SCOPE_CAPABILITIES - desc: "[$x_memory_scope_capability_flags_t] return a bit-field of atomic memory scope capabilities." + desc: "[$x_memory_scope_capability_flags_t][optional-query] return a bit-field of atomic memory scope capabilities." - name: ATOMIC_FENCE_ORDER_CAPABILITIES desc: | - [$x_memory_order_capability_flags_t] return a bit-field of atomic memory fence order capabilities. + [$x_memory_order_capability_flags_t][optional-query] return a bit-field of atomic memory fence order capabilities. Zero is returned if the backend does not support context-level fences. 
- name: ATOMIC_FENCE_SCOPE_CAPABILITIES desc: | - [$x_memory_scope_capability_flags_t] return a bit-field of atomic memory fence scope capabilities. + [$x_memory_scope_capability_flags_t][optional-query] return a bit-field of atomic memory fence scope capabilities. Zero is returned if the backend does not support context-level fences. --- #-------------------------------------------------------------------------- type: function diff --git a/scripts/core/device.yml b/scripts/core/device.yml index c430ff0b36..ce671c24d6 100644 --- a/scripts/core/device.yml +++ b/scripts/core/device.yml @@ -200,7 +200,7 @@ etors: - name: VENDOR_ID desc: "[uint32_t] vendor Id of the device" - name: DEVICE_ID - desc: "[uint32_t] Id of the device" + desc: "[uint32_t][optional-query] Id of the device" - name: MAX_COMPUTE_UNITS desc: "[uint32_t] the number of compute units" - name: MAX_WORK_ITEM_DIMENSIONS @@ -248,7 +248,7 @@ etors: - name: MAX_CLOCK_FREQUENCY desc: "[uint32_t] max clock frequency in MHz" - name: MEMORY_CLOCK_RATE - desc: "[uint32_t] memory clock frequency in MHz" + desc: "[uint32_t][optional-query] memory clock frequency in MHz" - name: ADDRESS_BITS desc: "[uint32_t] address bits" - name: MAX_MEM_ALLOC_SIZE @@ -290,7 +290,7 @@ etors: - name: GLOBAL_MEM_SIZE desc: "[uint64_t] size of global memory in bytes" - name: GLOBAL_MEM_FREE - desc: "[uint64_t] size of global memory which is free in bytes" + desc: "[uint64_t][optional-query] size of global memory which is free in bytes" - name: MAX_CONSTANT_BUFFER_SIZE desc: "[uint64_t] max constant buffer size in bytes" - name: MAX_CONSTANT_ARGS @@ -377,23 +377,23 @@ etors: - name: USM_SYSTEM_SHARED_SUPPORT desc: "[$x_device_usm_access_capability_flags_t] support USM system wide shared memory access" - name: UUID - desc: "[uint8_t[]] return device UUID" + desc: "[uint8_t[]][optional-query] return device UUID" - name: PCI_ADDRESS - desc: "[char[]] return device PCI address" + desc: "[char[]][optional-query] return device PCI address" - name: GPU_EU_COUNT - desc: "[uint32_t] return Intel GPU EU count" + desc: "[uint32_t][optional-query] return Intel GPU EU count" - name: GPU_EU_SIMD_WIDTH - desc: "[uint32_t] return Intel GPU EU SIMD width" + desc: "[uint32_t][optional-query] return Intel GPU EU SIMD width" - name: GPU_EU_SLICES - desc: "[uint32_t] return Intel GPU number of slices" + desc: "[uint32_t][optional-query] return Intel GPU number of slices" - name: GPU_EU_COUNT_PER_SUBSLICE - desc: "[uint32_t] return Intel GPU EU count per subslice" + desc: "[uint32_t][optional-query] return Intel GPU EU count per subslice" - name: GPU_SUBSLICES_PER_SLICE - desc: "[uint32_t] return Intel GPU number of subslices per slice" + desc: "[uint32_t][optional-query] return Intel GPU number of subslices per slice" - name: GPU_HW_THREADS_PER_EU - desc: "[uint32_t] return Intel GPU number of threads per EU" + desc: "[uint32_t][optional-query] return Intel GPU number of threads per EU" - name: MAX_MEMORY_BANDWIDTH - desc: "[uint32_t] return max memory bandwidth in Mb/s" + desc: "[uint32_t][optional-query] return max memory bandwidth in Mb/s" - name: IMAGE_SRGB desc: "[$x_bool_t] device supports sRGB images" - name: BUILD_ON_SUBDEVICE @@ -418,7 +418,7 @@ etors: - name: KERNEL_SET_SPECIALIZATION_CONSTANTS desc: "[$x_bool_t] support the $xKernelSetSpecializationConstants entry point" - name: MEMORY_BUS_WIDTH - desc: "[uint32_t] return the width in bits of the memory bus interface of the device." 
+ desc: "[uint32_t][optional-query] return the width in bits of the memory bus interface of the device." - name: MAX_WORK_GROUPS_3D desc: "[size_t[3]] return max 3D work groups" - name: ASYNC_BARRIER @@ -428,17 +428,17 @@ etors: - name: HOST_PIPE_READ_WRITE_SUPPORTED desc: "[$x_bool_t] Return true if the device supports enqueueing commands to read and write pipes from the host." - name: MAX_REGISTERS_PER_WORK_GROUP - desc: "[uint32_t] The maximum number of registers available per block." + desc: "[uint32_t][optional-query] The maximum number of registers available per block." - name: IP_VERSION - desc: "[uint32_t] The device IP version. The meaning of the device IP version is implementation-defined, but newer devices should have a higher version than older devices." + desc: "[uint32_t][optional-query] The device IP version. The meaning of the device IP version is implementation-defined, but newer devices should have a higher version than older devices." - name: VIRTUAL_MEMORY_SUPPORT desc: "[$x_bool_t] return true if the device supports virtual memory." - name: ESIMD_SUPPORT desc: "[$x_bool_t] return true if the device supports ESIMD." - name: COMPONENT_DEVICES - desc: "[$x_device_handle_t[]] The set of component devices contained by this composite device." + desc: "[$x_device_handle_t[]][optional-query] The set of component devices contained by this composite device." - name: COMPOSITE_DEVICE - desc: "[$x_device_handle_t] The composite device containing this component device." + desc: "[$x_device_handle_t][optional-query] The composite device containing this component device." - name: GLOBAL_VARIABLE_SUPPORT desc: "[$x_bool_t] return true if the device supports the `EnqueueDeviceGlobalVariableWrite` and `EnqueueDeviceGlobalVariableRead` entry points." - name: USM_POOL_SUPPORT diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 5bd95e1847..6ffa9ca5e0 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -124,7 +124,7 @@ etors: - name: ATTRIBUTES desc: "[char[]] Return null-terminated kernel attributes string." - name: NUM_REGS - desc: "[uint32_t] Return the number of registers used by the compiled kernel (device specific)." + desc: "[uint32_t][optional-query] Return the number of registers used by the compiled kernel." 
--- #-------------------------------------------------------------------------- type: enum desc: "Get Kernel Work Group information" @@ -133,7 +133,7 @@ name: $x_kernel_group_info_t typed_etors: True etors: - name: GLOBAL_WORK_SIZE - desc: "[size_t[3]] Return Work Group maximum global size" + desc: "[size_t[3]][optional-query] Return Work Group maximum global size" - name: WORK_GROUP_SIZE desc: "[size_t] Return maximum Work Group size" - name: COMPILE_WORK_GROUP_SIZE @@ -146,12 +146,12 @@ etors: desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel" - name: COMPILE_MAX_WORK_GROUP_SIZE desc: | - [size_t[3]] Return the maximum Work Group size guaranteed by the - source code, or (0, 0, 0) if unspecified + [size_t[3]][optional-query] Return the maximum Work Group size guaranteed + by the source code, or (0, 0, 0) if unspecified - name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE desc: | - [size_t] Return the maximum linearized Work Group size (X * Y * Z) - guaranteed by the source code, or 0 if unspecified + [size_t][optional-query] Return the maximum linearized Work Group size + (X * Y * Z) guaranteed by the source code, or 0 if unspecified --- #-------------------------------------------------------------------------- type: enum desc: "Get Kernel SubGroup information" diff --git a/scripts/core/program.yml b/scripts/core/program.yml index 769a312f1d..0449a58d6d 100644 --- a/scripts/core/program.yml +++ b/scripts/core/program.yml @@ -387,9 +387,9 @@ etors: - name: BINARIES desc: "[unsigned char[]] Return program binaries for all devices for this Program." - name: NUM_KERNELS - desc: "[size_t] Number of kernels in Program, return type size_t." + desc: "[size_t][optional-query] Number of kernels in Program, return type size_t." - name: KERNEL_NAMES - desc: "[char[]] Return a null-terminated, semi-colon separated list of kernel names in Program." + desc: "[char[]][optional-query] Return a null-terminated, semi-colon separated list of kernel names in Program." --- #-------------------------------------------------------------------------- type: function desc: "Query information about a Program object" diff --git a/scripts/core/queue.yml b/scripts/core/queue.yml index c8a6528fbd..cd8bd2668e 100644 --- a/scripts/core/queue.yml +++ b/scripts/core/queue.yml @@ -37,7 +37,7 @@ etors: if the queue was created with the `ON_DEVICE` queue flag, otherwise `$xQueueGetInfo` will return `$X_RESULT_ERROR_INVALID_QUEUE`. - name: EMPTY - desc: "[$x_bool_t] return true if the queue was empty at the time of the query" + desc: "[$x_bool_t][optional-query] return true if the queue was empty at the time of the query." 
--- #-------------------------------------------------------------------------- type: enum desc: "Queue property flags" diff --git a/scripts/core/usm.yml b/scripts/core/usm.yml index db112ed8eb..a3cf080e16 100644 --- a/scripts/core/usm.yml +++ b/scripts/core/usm.yml @@ -74,7 +74,7 @@ etors: - name: DEVICE desc: "[$x_device_handle_t] Memory allocation device info" - name: POOL - desc: "[$x_usm_pool_handle_t] Memory allocation pool info" + desc: "[$x_usm_pool_handle_t][optional-query] Memory allocation pool info" --- #-------------------------------------------------------------------------- type: enum desc: "USM memory advice" diff --git a/scripts/generate_code.py b/scripts/generate_code.py index 0c7476ab42..cdc3dfa229 100644 --- a/scripts/generate_code.py +++ b/scripts/generate_code.py @@ -548,3 +548,24 @@ def generate_level_zero_queue_api(path, section, namespace, tags, version, specs specs=specs, meta=meta) print("QUEUE Generated %s lines of code.\n" % loc) + +""" +Entry-point: + generates headers used by the CTS, for example containing meta-information + about info query enums +""" +def generate_cts_headers(path, section, namespace, tags, version, specs, meta): + template = "optional_queries.h.mako" + fin = os.path.join("templates", template) + name = "optional_queries" + filename = "optional_queries.h" + dstpath = os.path.join(path, "conformance", "testing", "include", "uur") + fout = os.path.join(dstpath, filename) + + print("Generating %s..." % fout) + + loc = util.makoWrite(fin, fout, + filename = name, namespace = namespace, + tags = tags, specs = specs, meta = meta) + + print("CTS Generated %s lines of code.\n" % loc) diff --git a/scripts/json2src.py b/scripts/json2src.py index df11f879ac..baff735d89 100755 --- a/scripts/json2src.py +++ b/scripts/json2src.py @@ -32,6 +32,7 @@ def add_argument(parser, name, help, default=False): add_argument(parser, "common", "generation of common files.", True) add_argument(parser, "tools", "generation of common files.", True) add_argument(parser, "l0_queue", "generation of l0 queue abstractions.", True) + add_argument(parser, "cts", "generation of cts headers", True) parser.add_argument("--debug", action='store_true', help="dump intermediate data to disk.") parser.add_argument("--sections", type=list, default=None, help="Optional list of sections for which to generate source, default is all") parser.add_argument("--ver", type=str, default="1.0", help="specification version to generate.") @@ -45,6 +46,7 @@ def add_argument(parser, name, help, default=False): srcpath = os.path.join(args.out_dir, "source") toolspath = os.path.join(args.out_dir, "tools") + testpath = os.path.join(args.out_dir, "test") for idx, specs in enumerate(input['specs']): config = input['configs'][idx] @@ -63,6 +65,8 @@ def add_argument(parser, name, help, default=False): generate_code.generate_tools(toolspath, config['name'], config['namespace'], config['tags'], args.ver, specs, input['meta']) if args.l0_queue: generate_code.generate_level_zero_queue_api(srcpath, config['name'], config['namespace'], config['tags'], args.ver, specs, input['meta']) + if args.cts: + generate_code.generate_cts_headers(testpath, config['name'], config['namespace'], config['tags'], args.ver, specs, input['meta']) if args.debug: util.makoFileListWrite("generated.json") diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py index 9c08f8be11..b0f8ec13d1 100644 --- a/scripts/templates/helper.py +++ b/scripts/templates/helper.py @@ -541,6 +541,18 @@ def is_global(item, tags): except: 
return False +""" + Extracts traits from an enumerator +""" +class etor_traits: + RE_OPTIONAL_QUERY = r".*\[optional-query\].*" + + @classmethod + def is_optional_query(cls, item): + try: + return True if re.match(cls.RE_OPTIONAL_QUERY, item['desc']) else False + except: + return False """ Public: @@ -780,7 +792,7 @@ def make_etor_name(namespace, tags, enum, etor, meta=None): returns the associated type of an etor from a typed enum """ def etor_get_associated_type(namespace, tags, item): - match = re.match(r'^\[(.+)\]\s', item['desc']) + match = re.match(r'^\[([$A-Za-z0-9_*[\] ]+)\]', item['desc']) if match: associated_type = match.group(1) return subt(namespace, tags, associated_type) @@ -1692,3 +1704,24 @@ def transform_queue_related_function_name(namespace, tags, obj, format = ["name" params = params[1:] return "{}({})".format(function_name, ", ".join(params)) + +""" +Public: + Returns a dictionary mapping info enum types to the list of optional queries + within that enum. If an enum type doesn't have any optional queries it will + not appear in the dictionary as a key. +""" +def get_optional_queries(specs, namespace, tags): + optional_queries = {} + for s in specs: + for obj in s['objects']: + if obj['type'] == 'enum': + optional_etors = [] + for e in obj['etors']: + if etor_traits.is_optional_query(e): + name = make_enum_name(namespace, tags, e) + optional_etors.append(name) + if optional_etors: + type_name = make_type_name(namespace, tags, obj) + optional_queries[type_name] = optional_etors + return optional_queries diff --git a/scripts/templates/optional_queries.h.mako b/scripts/templates/optional_queries.h.mako new file mode 100644 index 0000000000..6dab60c884 --- /dev/null +++ b/scripts/templates/optional_queries.h.mako @@ -0,0 +1,45 @@ +<%! +import re +from templates import helper as th +%><% +optional_queries = th.get_optional_queries(specs, namespace, tags) +%>/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file ${filename}.h + * + */ + + // Auto-generated file, do not edit. 
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <ur_api.h>
+
+namespace uur {
+
+template <class T> bool isQueryOptional(T) { return false; }
+
+%for type, names in optional_queries.items():
+constexpr std::array optional_${type} = {
+%for name in names:
+    ${name},
+%endfor
+};
+
+template <> inline bool isQueryOptional(${type} query) {
+    return std::find(optional_${type}.begin(),
+                     optional_${type}.end(),
+                     query) != optional_${type}.end();
+}
+
+%endfor
+
+}
diff --git a/scripts/templates/tools-info.hpp.mako b/scripts/templates/tools-info.hpp.mako
index ed9d67bb34..cff5cb837f 100644
--- a/scripts/templates/tools-info.hpp.mako
+++ b/scripts/templates/tools-info.hpp.mako
@@ -32,8 +32,9 @@ namespace urinfo {
 inline void printLoaderConfigInfos(${x}_loader_config_handle_t hLoaderConfig, std::string_view prefix = "  ") {
 %for etor in obj['etors']:
 %if 'REFERENCE_COUNT' not in etor['name']:
-    std::cout << prefix;
-    printLoaderConfigInfo<${etor['desc'][1:etor['desc'].find(' ')-1].replace('$x', x)}>(hLoaderConfig, ${etor['name'].replace('$X', X)});
+    <%etype = th.etor_get_associated_type(n, tags, etor)
+    %>std::cout << prefix;
+    printLoaderConfigInfo<${etype}>(hLoaderConfig, ${etor['name'].replace('$X', X)});
 %endif
 %endfor
 }
@@ -42,8 +43,9 @@
 inline void printAdapterInfos(${x}_adapter_handle_t hAdapter, std::string_view prefix = "  ") {
 %for etor in obj['etors']:
 %if 'REFERENCE_COUNT' not in etor['name']:
-    std::cout << prefix;
-    printAdapterInfo<${etor['desc'][1:etor['desc'].find(' ')-1].replace('$x', x)}>(hAdapter, ${etor['name'].replace('$X', X)});
+    <%etype = th.etor_get_associated_type(n, tags, etor)
+    %>std::cout << prefix;
+    printAdapterInfo<${etype}>(hAdapter, ${etor['name'].replace('$X', X)});
 %endif
 %endfor
 }
@@ -52,8 +54,9 @@
 %if obj["name"] == '$x_platform_info_t':
 inline void printPlatformInfos(${x}_platform_handle_t hPlatform, std::string_view prefix = "  ") {
 %for etor in obj['etors']:
-    std::cout << prefix;
-    printPlatformInfo<${etor['desc'][1:etor['desc'].find(' ')-1].replace('$x', x)}>(hPlatform, ${etor['name'].replace('$X', X)});
+    <%etype = th.etor_get_associated_type(n, tags, etor)
+    %>std::cout << prefix;
+    printPlatformInfo<${etype}>(hPlatform, ${etor['name'].replace('$X', X)});
 %endfor
 }

@@ -61,11 +64,12 @@
 %if obj['name'] == '$x_device_info_t':
 inline void printDeviceInfos(${x}_device_handle_t hDevice, std::string_view prefix = "  ") {
 %for etor in obj['etors']:
-    std::cout << prefix;
+    <%etype = th.etor_get_associated_type(n, tags, etor)
+    %>std::cout << prefix;
 %if etor['name'] == '$X_DEVICE_INFO_UUID':
     printDeviceUUID(hDevice, ${etor['name'].replace('$X', X)});
 %else:
-    printDeviceInfo<${etor['desc'][1:etor['desc'].find(' ')-1].replace('$x', x)}>(hDevice, ${etor['name'].replace('$X', X)});
+    printDeviceInfo<${etype}>(hDevice, ${etor['name'].replace('$X', X)});
 %endif
 %endfor
 }
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index 25b4cc7348..476e0c40a7 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -1087,6 +1087,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
+  case UR_DEVICE_INFO_IP_VERSION:
    return
UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: return ReturnValue( diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index dfd5e9e6b8..eacf78ea00 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -414,8 +414,14 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, // In SYCL this is only used in kernel bundle when building from source // which isn't currently supported for CUDA. return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - case UR_PROGRAM_INFO_NUM_KERNELS: case UR_PROGRAM_INFO_IL: + // Cuda only supports urProgramCreateWithBinary, so we can always return + // nothing for INFO_IL. + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; + case UR_PROGRAM_INFO_NUM_KERNELS: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 89fb05a562..c5ccee5bf4 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -904,6 +904,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_USM_POOL_SUPPORT: return ReturnValue(ur_bool_t{true}); + case UR_DEVICE_INFO_BFLOAT16: + return ReturnValue(true); + case UR_DEVICE_INFO_ASYNC_BARRIER: + return ReturnValue(false); + case UR_DEVICE_INFO_IL_VERSION: + return ReturnValue(""); + // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_COMPONENT_DEVICES: case UR_DEVICE_INFO_COMPOSITE_DEVICE: @@ -914,9 +921,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: - case UR_DEVICE_INFO_BFLOAT16: - case UR_DEVICE_INFO_IL_VERSION: - case UR_DEVICE_INFO_ASYNC_BARRIER: + case UR_DEVICE_INFO_IP_VERSION: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: return ReturnValue( diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index eae3fda366..442eeeea74 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -404,8 +404,14 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, // which isn't currently supported for HIP. return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_PROGRAM_INFO_NUM_KERNELS: - case UR_PROGRAM_INFO_IL: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_PROGRAM_INFO_IL: + // HIP only supports urProgramCreateWithBinary, so we can always return + // nothing for INFO_IL. + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; default: break; } diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 6705c4c659..373826f0fd 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -883,9 +883,6 @@ ur_result_t urDeviceGetInfo( return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); - case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: - // currently not supported in level zero runtime - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_DEVICE_INFO_BFLOAT16: { // bfloat16 math functions are not yet supported on Intel GPUs. 
return ReturnValue(ur_bool_t{false});
@@ -1187,6 +1184,10 @@ ur_result_t urDeviceGetInfo(
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
 #endif
   }
+  case UR_DEVICE_INFO_ASYNC_BARRIER:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED:
+    return ReturnValue(false);
   default:
     logger::error("Unsupported ParamName in urGetDeviceInfo");
     logger::error("ParamNameParamName={}(0x{})", ParamName,
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index eae16f0c57..649319867d 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -470,7 +470,7 @@ ur_result_t urEventGetInfo(
   }
   case UR_EVENT_INFO_COMMAND_TYPE: {
     std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);
-    return ReturnValue(ur_cast<uint32_t>(Event->CommandType));
+    return ReturnValue(ur_cast<ur_command_t>(Event->CommandType));
   }
   case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
     // Check to see if the event's Queue has an open command list due to
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 43b5f16cd4..572397aa5f 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -456,7 +456,7 @@ ur_result_t urQueueGetInfo(
     logger::error(
         "Unsupported ParamName in urQueueGetInfo: ParamName=ParamName={}(0x{})",
         ParamName, logger::toHex(ParamName));
-    return UR_RESULT_ERROR_INVALID_VALUE;
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }

   return UR_RESULT_SUCCESS;
diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp
index 69c8bfc784..48cdf9a404 100644
--- a/source/adapters/native_cpu/device.cpp
+++ b/source/adapters/native_cpu/device.cpp
@@ -365,7 +365,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D:
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE:
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
-    return UR_RESULT_ERROR_INVALID_VALUE;
+  case UR_DEVICE_INFO_GLOBAL_MEM_FREE:
+  case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
+  case UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP:
+  case UR_DEVICE_INFO_IP_VERSION:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
     return ReturnValue(
         static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
@@ -400,7 +404,6 @@
   // These two are exclusive of L0.
   return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

-  CASE_UR_UNSUPPORTED(UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH);
   case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
     return ReturnValue(false);
diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp
index f92068436b..dc5343b51c 100644
--- a/source/adapters/opencl/device.cpp
+++ b/source/adapters/opencl/device.cpp
@@ -536,7 +536,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
         cl_adapter::cast<cl_device_id>(hDevice), {"cl_khr_fp16"}, Supported));
     if (!Supported) {
-      return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+      // If we don't support the extension then our capabilities are 0.
+      ur_device_fp_capability_flags_t halfCapabilities = 0;
+      return ReturnValue(halfCapabilities);
     }
   }
@@ -779,9 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(true);
   }

-  case UR_DEVICE_INFO_BFLOAT16: {
-    return ReturnValue(false);
-  }
   case UR_DEVICE_INFO_ATOMIC_64: {
     bool Supported = false;
     UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions(
@@ -1126,7 +1125,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(UUID);
   }

-  case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: {
+  // We can't query to check if these are supported; they will need to be
+  // manually updated if support is ever implemented.
+  case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS:
+  case UR_DEVICE_INFO_BFLOAT16:
+  case UR_DEVICE_INFO_ASYNC_BARRIER: {
     return ReturnValue(false);
   }

@@ -1138,8 +1141,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
    * EU device-specific information extensions. Some of the queries are
    * enabled by cl_intel_device_attribute_query extension, but it's not yet in
    * the Registry. */
-  case UR_DEVICE_INFO_COMPONENT_DEVICES:
-  case UR_DEVICE_INFO_COMPOSITE_DEVICE:
   case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
   case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
   /* These enums have no equivalent in OpenCL */
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case UR_DEVICE_INFO_GLOBAL_MEM_FREE:
   case UR_DEVICE_INFO_MEMORY_CLOCK_RATE:
   case UR_DEVICE_INFO_MEMORY_BUS_WIDTH:
-  case UR_DEVICE_INFO_ASYNC_BARRIER:
+  case UR_DEVICE_INFO_COMPONENT_DEVICES:
+  case UR_DEVICE_INFO_COMPOSITE_DEVICE:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP: {
     bool Is2DBlockIOSupported = false;
diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp
index dfcc1dfafa..7961cb76ff 100644
--- a/source/adapters/opencl/usm.cpp
+++ b/source/adapters/opencl/usm.cpp
@@ -669,7 +669,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
     PropNameCL = CL_MEM_ALLOC_DEVICE_INTEL;
     break;
   default:
-    return UR_RESULT_ERROR_INVALID_VALUE;
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   }

   size_t CheckPropSize = 0;
diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp
index 63c3cbfca1..280948bc13 100644
--- a/test/conformance/adapter/urAdapterGetInfo.cpp
+++ b/test/conformance/adapter/urAdapterGetInfo.cpp
@@ -37,7 +37,8 @@ INSTANTIATE_TEST_SUITE_P(
 TEST_P(urAdapterGetInfoTest, Success) {
     size_t size = 0;
     ur_adapter_info_t info_type = GetParam();
-    ASSERT_SUCCESS(urAdapterGetInfo(adapter, info_type, 0, nullptr, &size));
+    ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+        urAdapterGetInfo(adapter, info_type, 0, nullptr, &size), info_type);
     ASSERT_NE(size, 0);

     if (const auto expected_size = adapter_info_size_map.find(info_type);
diff --git a/test/conformance/context/urContextGetInfo.cpp b/test/conformance/context/urContextGetInfo.cpp
index 46bc2cd179..bd6bdd3c3d 100644
--- a/test/conformance/context/urContextGetInfo.cpp
+++ b/test/conformance/context/urContextGetInfo.cpp
@@ -66,8 +66,8 @@ UUR_TEST_SUITE_P(urContextGetInfoTestWithInfoParam,
 TEST_P(urContextGetInfoTestWithInfoParam, Success) {
     ur_context_info_t info = getParam();
     size_t info_size = 0;
-    UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
-        urContextGetInfo(context, info, 0, nullptr, &info_size));
+    ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+        urContextGetInfo(context, info, 0, nullptr,
&info_size), info);
     ASSERT_NE(info_size, 0);

     if (const auto expected_size = ctx_info_size_map.find(info);
diff --git a/test/conformance/device/device_adapter_cuda.match b/test/conformance/device/device_adapter_cuda.match
index 48e00debe4..e42f948b73 100644
--- a/test/conformance/device/device_adapter_cuda.match
+++ b/test/conformance/device/device_adapter_cuda.match
@@ -1 +1,2 @@
 {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
+urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS
diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match
index 48e00debe4..87140fb10c 100644
--- a/test/conformance/device/device_adapter_level_zero.match
+++ b/test/conformance/device/device_adapter_level_zero.match
@@ -1 +1,2 @@
 {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
+{{OPT}}urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE
diff --git a/test/conformance/device/device_adapter_native_cpu.match b/test/conformance/device/device_adapter_native_cpu.match
index 00b3642e71..da07da18dc 100644
--- a/test/conformance/device/device_adapter_native_cpu.match
+++ b/test/conformance/device/device_adapter_native_cpu.match
@@ -3,27 +3,15 @@ urDeviceCreateWithNativeHandleTest.InvalidNullPointerDevice
 {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
 urDeviceGetInfoSingleTest.MaxWorkGroupSizeIsNonzero
 {{OPT}}urDeviceSelectBinaryTest.Success
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_DEVICE_ID
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEMORY_CLOCK_RATE
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_IL_VERSION
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_UUID
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_PCI_ADDRESS
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_EU_COUNT
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_EU_SLICES
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEMORY_BUS_WIDTH
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_WORK_GROUPS_3D
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP
+urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES
+urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS
diff --git a/test/conformance/device/urDeviceGetInfo.cpp b/test/conformance/device/urDeviceGetInfo.cpp
index 23a2f7f237..1fc8e6ca7f 100644
--- a/test/conformance/device/urDeviceGetInfo.cpp
+++ b/test/conformance/device/urDeviceGetInfo.cpp
@@ -4,7 +4,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include
-#include
 #include

 static std::unordered_map<ur_device_info_t, size_t> device_info_size_map = {
@@ -117,7 +116,14 @@ static std::unordered_map<ur_device_info_t, size_t> device_info_size_map = {
     {UR_DEVICE_INFO_COMPOSITE_DEVICE, sizeof(ur_device_handle_t)},
     {UR_DEVICE_INFO_USM_POOL_SUPPORT, sizeof(ur_bool_t)},
     {UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP,
-     sizeof(ur_exp_device_2d_block_array_capability_flags_t)}};
+     sizeof(ur_exp_device_2d_block_array_capability_flags_t)},
+    {UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES,
+     sizeof(ur_memory_order_capability_flags_t)},
+    {UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES,
+     sizeof(ur_memory_scope_capability_flags_t)},
+    {UR_DEVICE_INFO_ESIMD_SUPPORT, sizeof(ur_bool_t)},
+    {UR_DEVICE_INFO_IP_VERSION, sizeof(uint32_t)},
+    {UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT, sizeof(ur_bool_t)}};

 struct urDeviceGetInfoTest : uur::urAllDevicesTest,
                              ::testing::WithParamInterface<ur_device_info_t> {
@@ -238,9 +244,16 @@ INSTANTIATE_TEST_SUITE_P(
         UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED,      //
         UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP,        //
         UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT,              //
-        UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS, //
         UR_DEVICE_INFO_USM_POOL_SUPPORT,                    //
-        UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP      //
+        UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES,     //
+        UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES,     //
+        UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,    //
+        UR_DEVICE_INFO_IP_VERSION,                          //
+        UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS, //
+        UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP,     //
+        UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE,       //
+        UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF,         //
+        UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT           //
         ),
     [](const ::testing::TestParamInfo<ur_device_info_t> &info) {
         std::stringstream ss;
@@ -266,32 +279,27 @@ TEST_P(urDeviceGetInfoTest, Success) {
     ur_device_info_t info_type = GetParam();
     for (auto device : devices) {
         size_t size = 0;
-        ur_result_t result =
-            urDeviceGetInfo(device, info_type, 0, nullptr, &size);
+        ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+            urDeviceGetInfo(device, info_type, 0, nullptr, &size), info_type);

-        if (result == UR_RESULT_SUCCESS) {
-            if (doesReturnArray(info_type) && size == 0) {
-                return;
-            }
-            ASSERT_NE(size, 0);
-
-            if (const auto expected_size = device_info_size_map.find(info_type);
-                expected_size != device_info_size_map.end()) {
-                ASSERT_EQ(expected_size->second, size);
-            }
+        if (doesReturnArray(info_type) && size == 0) {
+            return;
+        }
+        ASSERT_NE(size, 0);

-            std::vector<char> info_data(size);
-            ASSERT_SUCCESS(urDeviceGetInfo(device, info_type, size,
-                                           info_data.data(), nullptr));
+        if (const auto expected_size = device_info_size_map.find(info_type);
+            expected_size != device_info_size_map.end()) {
+            ASSERT_EQ(expected_size->second, size);
+        }

-            if (info_type == UR_DEVICE_INFO_PLATFORM) {
-                auto returned_platform =
-                    reinterpret_cast<ur_platform_handle_t *>(info_data.data());
-                ASSERT_EQ(*returned_platform, platform);
-            }
+        std::vector<char> info_data(size);
+        ASSERT_SUCCESS(urDeviceGetInfo(device, info_type, size,
+                                       info_data.data(), nullptr));

-        } else {
-            ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+        if (info_type == UR_DEVICE_INFO_PLATFORM) {
+            auto returned_platform =
+                reinterpret_cast<ur_platform_handle_t *>(info_data.data());
+            ASSERT_EQ(*returned_platform, platform);
         }
     }
 }
diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match
index 4155859eaf..fdfbe01690 100644
--- a/test/conformance/enqueue/enqueue_adapter_level_zero.match
+++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match
@@ -53,19 +53,9 @@
 {{OPT}}urEnqueueMemImageWriteTest.InvalidRegion2D/*
{{OPT}}urEnqueueMemImageWriteTest.InvalidRegion3D/* {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/* -urEnqueueReadHostPipeTest.InvalidEventWaitList/* -urEnqueueReadHostPipeTest.InvalidNullHandleProgram/* -urEnqueueReadHostPipeTest.InvalidNullHandleQueue/* -urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/* -urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/* urEnqueueUSMAdviseTest.InvalidSizeTooLarge/* urEnqueueUSMFill2DNegativeTest.OutOfBounds/* {{OPT}}urEnqueueUSMMemcpyTest.Blocking/* {{OPT}}urEnqueueUSMMemcpyTest.BlockingWithEvent/* {{OPT}}urEnqueueUSMMemcpyTest.WaitForDependencies/* urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/* -urEnqueueWriteHostPipeTest.InvalidEventWaitList/* -urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/* -urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/* -urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/* -urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/* diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match index 7b1739df4e..f0af10a448 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match @@ -71,14 +71,4 @@ urEnqueueKernelLaunchKernelWgSizeTest.Success/* urEnqueueUSMFill2DNegativeTest.OutOfBounds/* urEnqueueUSMAdviseTest.InvalidSizeTooLarge/* urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/* -urEnqueueReadHostPipeTest.InvalidNullHandleQueue/* -urEnqueueReadHostPipeTest.InvalidNullHandleProgram/* -urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/* -urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/* -urEnqueueReadHostPipeTest.InvalidEventWaitList/* -urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/* -urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/* -urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/* -urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/* -urEnqueueWriteHostPipeTest.InvalidEventWaitList/* {{OPT}}urEnqueueTimestampRecordingExpTest.SuccessBlocking/* diff --git a/test/conformance/event/urEventGetInfo.cpp b/test/conformance/event/urEventGetInfo.cpp index 4cca805cd0..d2ff63657b 100644 --- a/test/conformance/event/urEventGetInfo.cpp +++ b/test/conformance/event/urEventGetInfo.cpp @@ -11,7 +11,8 @@ TEST_P(urEventGetInfoTest, Success) { ur_event_info_t info_type = getParam(); size_t size; - ASSERT_SUCCESS(urEventGetInfo(event, info_type, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urEventGetInfo(event, info_type, 0, nullptr, &size), info_type); ASSERT_NE(size, 0); std::vector data(size); ASSERT_SUCCESS( diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 6289de7b9e..73e3db2d18 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -12,8 +12,9 @@ TEST_P(urEventGetProfilingInfoTest, Success) { ur_profiling_info_t info_type = getParam(); size_t size; - ASSERT_SUCCESS( - urEventGetProfilingInfo(event, info_type, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urEventGetProfilingInfo(event, info_type, 0, nullptr, &size), + info_type); ASSERT_EQ(size, 8); std::vector data(size); diff --git a/test/conformance/kernel/kernel_adapter_opencl.match b/test/conformance/kernel/kernel_adapter_opencl.match deleted file mode 100644 index 687d7be2e7..0000000000 --- a/test/conformance/kernel/kernel_adapter_opencl.match +++ /dev/null @@ -1 +0,0 @@ 
-urKernelGetInfoTest.Success/*_UR_KERNEL_INFO_NUM_REGS diff --git a/test/conformance/kernel/urKernelGetGroupInfo.cpp b/test/conformance/kernel/urKernelGetGroupInfo.cpp index 2b3c70c22e..35e837e97a 100644 --- a/test/conformance/kernel/urKernelGetGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetGroupInfo.cpp @@ -43,16 +43,14 @@ TEST_P(urKernelGetGroupInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; std::vector property_value; - auto result = urKernelGetGroupInfo(kernel, device, property_name, 0, - nullptr, &property_size); - if (result == UR_RESULT_SUCCESS) { - property_value.resize(property_size); - ASSERT_SUCCESS(urKernelGetGroupInfo(kernel, device, property_name, - property_size, - property_value.data(), nullptr)); - } else { - ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - } + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urKernelGetGroupInfo(kernel, device, property_name, 0, nullptr, + &property_size), + property_name); + property_value.resize(property_size); + ASSERT_SUCCESS(urKernelGetGroupInfo(kernel, device, property_name, + property_size, property_value.data(), + nullptr)); } TEST_P(urKernelGetGroupInfoTest, InvalidNullHandleKernel) { diff --git a/test/conformance/kernel/urKernelGetInfo.cpp b/test/conformance/kernel/urKernelGetInfo.cpp index e87ab1da13..4749abc367 100644 --- a/test/conformance/kernel/urKernelGetInfo.cpp +++ b/test/conformance/kernel/urKernelGetInfo.cpp @@ -22,8 +22,9 @@ TEST_P(urKernelGetInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; std::vector property_value; - ASSERT_SUCCESS( - urKernelGetInfo(kernel, property_name, 0, nullptr, &property_size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urKernelGetInfo(kernel, property_name, 0, nullptr, &property_size), + property_name); property_value.resize(property_size); ASSERT_SUCCESS(urKernelGetInfo(kernel, property_name, property_size, property_value.data(), nullptr)); diff --git a/test/conformance/kernel/urKernelGetSubGroupInfo.cpp b/test/conformance/kernel/urKernelGetSubGroupInfo.cpp index fa4e045483..f2fc8f0197 100644 --- a/test/conformance/kernel/urKernelGetSubGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetSubGroupInfo.cpp @@ -27,8 +27,10 @@ TEST_P(urKernelGetSubGroupInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; std::vector property_value; - ASSERT_SUCCESS(urKernelGetSubGroupInfo(kernel, device, property_name, 0, - nullptr, &property_size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urKernelGetSubGroupInfo(kernel, device, property_name, 0, nullptr, + &property_size), + property_name); property_value.resize(property_size); ASSERT_SUCCESS(urKernelGetSubGroupInfo(kernel, device, property_name, property_size, property_value.data(), diff --git a/test/conformance/memory/urMemGetInfo.cpp b/test/conformance/memory/urMemGetInfo.cpp index 3f933d39a9..ddcc2761d0 100644 --- a/test/conformance/memory/urMemGetInfo.cpp +++ b/test/conformance/memory/urMemGetInfo.cpp @@ -23,7 +23,8 @@ UUR_TEST_SUITE_P(urMemGetInfoTestWithParam, TEST_P(urMemGetInfoTestWithParam, Success) { ur_mem_info_t info = getParam(); size_t size; - ASSERT_SUCCESS(urMemGetInfo(buffer, info, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemGetInfo(buffer, info, 0, nullptr, &size), info); ASSERT_NE(size, 0); if (const auto expected_size = mem_info_size_map.find(info); diff --git a/test/conformance/memory/urMemImageGetInfo.cpp b/test/conformance/memory/urMemImageGetInfo.cpp index ae85720220..1a67e42fad 100644 --- 
a/test/conformance/memory/urMemImageGetInfo.cpp +++ b/test/conformance/memory/urMemImageGetInfo.cpp @@ -28,7 +28,8 @@ UUR_TEST_SUITE_P(urMemImageGetInfoTest, TEST_P(urMemImageGetInfoTest, Success) { ur_image_info_t info = getParam(); size_t size = 0; - ASSERT_SUCCESS(urMemImageGetInfo(image, info, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urMemImageGetInfo(image, info, 0, nullptr, &size), info); ASSERT_NE(size, 0); if (const auto expected_size = image_info_size_map.find(info); diff --git a/test/conformance/platform/urPlatformGetInfo.cpp b/test/conformance/platform/urPlatformGetInfo.cpp index 1dc92b26d7..0633e420f6 100644 --- a/test/conformance/platform/urPlatformGetInfo.cpp +++ b/test/conformance/platform/urPlatformGetInfo.cpp @@ -29,7 +29,8 @@ INSTANTIATE_TEST_SUITE_P( TEST_P(urPlatformGetInfoTest, Success) { size_t size = 0; ur_platform_info_t info_type = GetParam(); - ASSERT_SUCCESS(urPlatformGetInfo(platform, info_type, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urPlatformGetInfo(platform, info_type, 0, nullptr, &size), info_type); if (info_type == UR_PLATFORM_INFO_BACKEND) { ASSERT_EQ(size, sizeof(ur_platform_backend_t)); } else { diff --git a/test/conformance/program/program_adapter_level_zero.match b/test/conformance/program/program_adapter_level_zero.match index 97d6869b81..8cb53734bd 100644 --- a/test/conformance/program/program_adapter_level_zero.match +++ b/test/conformance/program/program_adapter_level_zero.match @@ -1,3 +1,4 @@ urProgramSetSpecializationConstantsTest.InvalidValueSize/* urProgramSetSpecializationConstantsTest.InvalidValueId/* urProgramSetSpecializationConstantsTest.InvalidValuePtr/* +urProgramGetBuildInfoTest.Success/*UR_PROGRAM_BUILD_INFO_STATUS diff --git a/test/conformance/program/program_adapter_level_zero_v2.match b/test/conformance/program/program_adapter_level_zero_v2.match index fd359b3653..920d5f1b5b 100644 --- a/test/conformance/program/program_adapter_level_zero_v2.match +++ b/test/conformance/program/program_adapter_level_zero_v2.match @@ -1,4 +1,5 @@ urProgramSetSpecializationConstantsTest.InvalidValueSize/* urProgramSetSpecializationConstantsTest.InvalidValueId/* urProgramSetSpecializationConstantsTest.InvalidValuePtr/* +urProgramGetBuildInfoTest.Success/*UR_PROGRAM_BUILD_INFO_STATUS {{OPT}}urMultiDeviceCommandBufferExpTest.* diff --git a/test/conformance/program/urProgramGetBuildInfo.cpp b/test/conformance/program/urProgramGetBuildInfo.cpp index cf4e9b9217..df46467197 100644 --- a/test/conformance/program/urProgramGetBuildInfo.cpp +++ b/test/conformance/program/urProgramGetBuildInfo.cpp @@ -33,19 +33,10 @@ TEST_P(urProgramGetBuildInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; std::vector property_value; - ur_platform_backend_t backend; - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - auto result = urProgramGetBuildInfo(program, device, property_name, 0, - nullptr, &property_size); - - if (property_name == UR_PROGRAM_BUILD_INFO_STATUS && - backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - ASSERT_EQ(UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION, result); - return; - } - - ASSERT_SUCCESS(result); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urProgramGetBuildInfo(program, device, property_name, 0, nullptr, + &property_size), + property_name); property_value.resize(property_size); ASSERT_SUCCESS(urProgramGetBuildInfo(program, device, property_name, property_size, property_value.data(), diff --git 
a/test/conformance/program/urProgramGetInfo.cpp b/test/conformance/program/urProgramGetInfo.cpp index 7b2e6f1873..176a278583 100644 --- a/test/conformance/program/urProgramGetInfo.cpp +++ b/test/conformance/program/urProgramGetInfo.cpp @@ -52,15 +52,18 @@ TEST_P(urProgramGetInfoTest, Success) { sizeof(binaries[0]), binaries, nullptr)); } else { - auto result = urProgramGetInfo(program, property_name, 0, nullptr, - &property_size); - if (result != UR_RESULT_SUCCESS) { - ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - return; + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urProgramGetInfo(program, property_name, 0, nullptr, + &property_size), + property_name); + if (property_size) { + property_value.resize(property_size); + ASSERT_SUCCESS(urProgramGetInfo(program, property_name, + property_size, + property_value.data(), nullptr)); + } else { + ASSERT_EQ(property_name, UR_PROGRAM_INFO_IL); } - property_value.resize(property_size); - ASSERT_SUCCESS(urProgramGetInfo(program, property_name, property_size, - property_value.data(), nullptr)); } switch (property_name) { case UR_PROGRAM_INFO_REFERENCE_COUNT: { @@ -108,7 +111,11 @@ TEST_P(urProgramGetInfoTest, Success) { break; } case UR_PROGRAM_INFO_IL: { - ASSERT_EQ(property_value, *il_binary.get()); + // Some adapters only support ProgramCreateWithBinary, in those cases we + // expect a return size of 0 and an empty return value for INFO_IL. + if (!property_value.empty()) { + ASSERT_EQ(property_value, *il_binary.get()); + } break; } default: diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match index 1c48a80fed..8bac2b269e 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -8,6 +8,7 @@ urQueueGetInfoTest.Device/* urQueueGetInfoTest.Flags/* urQueueGetInfoTest.ReferenceCount/* urQueueGetInfoTest.InvalidSizeSmall/* +urQueueGetInfoTest.EmptyQueue/* urQueueGetInfoDeviceQueueTestWithInfoParam.DeviceDefault/* urQueueGetInfoDeviceQueueTestWithInfoParam.Size/* urQueueRetainTest.Success/* diff --git a/test/conformance/queue/urQueueGetInfo.cpp b/test/conformance/queue/urQueueGetInfo.cpp index e3330fef6f..57fdd954d1 100644 --- a/test/conformance/queue/urQueueGetInfo.cpp +++ b/test/conformance/queue/urQueueGetInfo.cpp @@ -2,7 +2,6 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
// See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include #include using urQueueGetInfoTest = uur::urQueueTest; @@ -11,7 +10,8 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueGetInfoTest); TEST_P(urQueueGetInfoTest, Context) { size_t size = 0; auto infoType = UR_QUEUE_INFO_CONTEXT; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_context_handle_t), size); @@ -26,7 +26,8 @@ TEST_P(urQueueGetInfoTest, Context) { TEST_P(urQueueGetInfoTest, Device) { size_t size = 0; auto infoType = UR_QUEUE_INFO_DEVICE; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_device_handle_t), size); @@ -40,7 +41,8 @@ TEST_P(urQueueGetInfoTest, Device) { TEST_P(urQueueGetInfoTest, Flags) { size_t size = 0; auto infoType = UR_QUEUE_INFO_FLAGS; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_queue_flags_t), size); @@ -54,7 +56,8 @@ TEST_P(urQueueGetInfoTest, Flags) { TEST_P(urQueueGetInfoTest, ReferenceCount) { size_t size = 0; auto infoType = UR_QUEUE_INFO_REFERENCE_COUNT; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(uint32_t), size); @@ -68,8 +71,8 @@ TEST_P(urQueueGetInfoTest, ReferenceCount) { TEST_P(urQueueGetInfoTest, EmptyQueue) { size_t size = 0; auto infoType = UR_QUEUE_INFO_EMPTY; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_bool_t), size); @@ -159,7 +162,8 @@ TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, DeviceDefault) { size_t size = 0; auto infoType = UR_QUEUE_INFO_DEVICE_DEFAULT; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_queue_handle_t), size); @@ -174,7 +178,8 @@ TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, Size) { size_t size = 0; auto infoType = UR_QUEUE_INFO_SIZE; - ASSERT_SUCCESS(urQueueGetInfo(queue, infoType, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urQueueGetInfo(queue, infoType, 0, nullptr, &size), infoType); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(uint32_t), size); diff --git a/test/conformance/sampler/urSamplerGetInfo.cpp b/test/conformance/sampler/urSamplerGetInfo.cpp index e1f9326df7..9cf1a3f7ec 100644 --- a/test/conformance/sampler/urSamplerGetInfo.cpp +++ b/test/conformance/sampler/urSamplerGetInfo.cpp @@ -18,7 +18,8 @@ UUR_TEST_SUITE_P(urSamplerGetInfoTestWithParam, TEST_P(urSamplerGetInfoTestWithParam, Success) { size_t size = 0; ur_sampler_info_t info = getParam(); - ASSERT_SUCCESS(urSamplerGetInfo(sampler, info, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urSamplerGetInfo(sampler, info, 0, nullptr, &size), info); ASSERT_NE(size, 0); std::vector infoData(size); ASSERT_SUCCESS( diff --git 
a/test/conformance/testing/include/uur/checks.h b/test/conformance/testing/include/uur/checks.h
index 2ad3925842..c0adb12453 100644
--- a/test/conformance/testing/include/uur/checks.h
+++ b/test/conformance/testing/include/uur/checks.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <uur/optional_queries.h>
 #include

 namespace uur {
@@ -46,6 +47,21 @@ inline std::ostream &operator<<(std::ostream &out, const Result &result) {
 #define EXPECT_SUCCESS(ACTUAL) EXPECT_EQ_RESULT(UR_RESULT_SUCCESS, ACTUAL)
 #endif

+// This macro is intended to be used for the first call to a GetInfo query; it
+// gracefully handles cases where the adapter doesn't support a query marked
+// [optional-query] in the spec by returning early.
+#ifndef ASSERT_SUCCESS_OR_OPTIONAL_QUERY
+#define ASSERT_SUCCESS_OR_OPTIONAL_QUERY(CALL, QUERY)                          \
+    do {                                                                       \
+        auto result = CALL;                                                    \
+        if (result != UR_RESULT_SUCCESS) {                                     \
+            ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); \
+            ASSERT_TRUE(uur::isQueryOptional(QUERY));                          \
+            return;                                                            \
+        }                                                                      \
+    } while (0)
+#endif
+
 inline std::ostream &operator<<(std::ostream &out,
                                 const ur_device_handle_t &device) {
     out << uur::GetDeviceName(device);
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
index 219434fb62..e5c5d39591 100644
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -1098,6 +1098,12 @@ struct urUSMDeviceAllocTestWithParam : urQueueTestWithParam {
         GTEST_SKIP() << "Device USM is not supported";
     }
     if (use_pool) {
+        ur_bool_t poolSupport = false;
+        ASSERT_SUCCESS(
+            uur::GetDeviceUSMPoolSupport(this->device, poolSupport));
+        if (!poolSupport) {
+            GTEST_SKIP() << "USM pools are not supported.";
+        }
         ur_usm_pool_desc_t pool_desc = {};
         ASSERT_SUCCESS(urUSMPoolCreate(this->context, &pool_desc, &pool));
     }
diff --git a/test/conformance/testing/include/uur/optional_queries.h b/test/conformance/testing/include/uur/optional_queries.h
new file mode 100644
index 0000000000..a47e37a4e0
--- /dev/null
+++ b/test/conformance/testing/include/uur/optional_queries.h
@@ -0,0 +1,117 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file optional_queries.h
+ *
+ */
+
+// Auto-generated file, do not edit.
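+// The lists below are produced from the [optional-query] annotations in the
+// scripts/core/*.yml spec files by scripts/templates/optional_queries.h.mako;
+// to change them, edit the YaML descriptions and regenerate via
+// scripts/json2src.py rather than editing this header by hand.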
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <ur_api.h>
+
+namespace uur {
+
+template <class T> bool isQueryOptional(T) { return false; }
+
+constexpr std::array optional_ur_device_info_t = {
+    UR_DEVICE_INFO_DEVICE_ID,
+    UR_DEVICE_INFO_MEMORY_CLOCK_RATE,
+    UR_DEVICE_INFO_GLOBAL_MEM_FREE,
+    UR_DEVICE_INFO_UUID,
+    UR_DEVICE_INFO_PCI_ADDRESS,
+    UR_DEVICE_INFO_GPU_EU_COUNT,
+    UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH,
+    UR_DEVICE_INFO_GPU_EU_SLICES,
+    UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
+    UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE,
+    UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU,
+    UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH,
+    UR_DEVICE_INFO_MEMORY_BUS_WIDTH,
+    UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP,
+    UR_DEVICE_INFO_IP_VERSION,
+    UR_DEVICE_INFO_COMPONENT_DEVICES,
+    UR_DEVICE_INFO_COMPOSITE_DEVICE,
+};
+
+template <> inline bool isQueryOptional(ur_device_info_t query) {
+    return std::find(optional_ur_device_info_t.begin(),
+                     optional_ur_device_info_t.end(),
+                     query) != optional_ur_device_info_t.end();
+}
+
+constexpr std::array optional_ur_context_info_t = {
+    UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES,
+    UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
+    UR_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES,
+    UR_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES,
+};
+
+template <> inline bool isQueryOptional(ur_context_info_t query) {
+    return std::find(optional_ur_context_info_t.begin(),
+                     optional_ur_context_info_t.end(),
+                     query) != optional_ur_context_info_t.end();
+}
+
+constexpr std::array optional_ur_usm_alloc_info_t = {
+    UR_USM_ALLOC_INFO_POOL,
+};
+
+template <> inline bool isQueryOptional(ur_usm_alloc_info_t query) {
+    return std::find(optional_ur_usm_alloc_info_t.begin(),
+                     optional_ur_usm_alloc_info_t.end(),
+                     query) != optional_ur_usm_alloc_info_t.end();
+}
+
+constexpr std::array optional_ur_program_info_t = {
+    UR_PROGRAM_INFO_NUM_KERNELS,
+    UR_PROGRAM_INFO_KERNEL_NAMES,
+};
+
+template <> inline bool isQueryOptional(ur_program_info_t query) {
+    return std::find(optional_ur_program_info_t.begin(),
+                     optional_ur_program_info_t.end(),
+                     query) != optional_ur_program_info_t.end();
+}
+
+constexpr std::array optional_ur_kernel_info_t = {
+    UR_KERNEL_INFO_NUM_REGS,
+};
+
+template <> inline bool isQueryOptional(ur_kernel_info_t query) {
+    return std::find(optional_ur_kernel_info_t.begin(),
+                     optional_ur_kernel_info_t.end(),
+                     query) != optional_ur_kernel_info_t.end();
+}
+
+constexpr std::array optional_ur_kernel_group_info_t = {
+    UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE,
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE,
+};
+
+template <> inline bool isQueryOptional(ur_kernel_group_info_t query) {
+    return std::find(optional_ur_kernel_group_info_t.begin(),
+                     optional_ur_kernel_group_info_t.end(),
+                     query) != optional_ur_kernel_group_info_t.end();
+}
+
+constexpr std::array optional_ur_queue_info_t = {
+    UR_QUEUE_INFO_EMPTY,
+};
+
+template <> inline bool isQueryOptional(ur_queue_info_t query) {
+    return std::find(optional_ur_queue_info_t.begin(),
+                     optional_ur_queue_info_t.end(),
+                     query) != optional_ur_queue_info_t.end();
+}
+
+} // namespace uur
diff --git a/test/conformance/usm/urUSMGetMemAllocInfo.cpp b/test/conformance/usm/urUSMGetMemAllocInfo.cpp
index 181e1d08b3..38719379a9 100644
--- a/test/conformance/usm/urUSMGetMemAllocInfo.cpp
+++ b/test/conformance/usm/urUSMGetMemAllocInfo.cpp
@@ -33,8 +33,9 @@ static std::unordered_map<ur_usm_alloc_info_t, size_t> usm_info_size_map = {
 TEST_P(urUSMGetMemAllocInfoTest, Success) {
     size_t size = 0;
     auto alloc_info = getParam();
-
ASSERT_SUCCESS( - urUSMGetMemAllocInfo(context, ptr, alloc_info, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urUSMGetMemAllocInfo(context, ptr, alloc_info, 0, nullptr, &size), + alloc_info); ASSERT_NE(size, 0); if (const auto expected_size = usm_info_size_map.find(alloc_info); diff --git a/test/conformance/usm/urUSMPoolGetInfo.cpp b/test/conformance/usm/urUSMPoolGetInfo.cpp index 945ce1b0b2..e43bc3a77b 100644 --- a/test/conformance/usm/urUSMPoolGetInfo.cpp +++ b/test/conformance/usm/urUSMPoolGetInfo.cpp @@ -22,7 +22,8 @@ UUR_TEST_SUITE_P(urUSMPoolGetInfoTestWithInfoParam, TEST_P(urUSMPoolGetInfoTestWithInfoParam, Success) { ur_usm_pool_info_t info_type = getParam(); size_t size = 0; - ASSERT_SUCCESS(urUSMPoolGetInfo(pool, info_type, 0, nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urUSMPoolGetInfo(pool, info_type, 0, nullptr, &size), info_type); ASSERT_NE(size, 0); if (const auto expected_size = pool_info_size_map.find(info_type); diff --git a/test/conformance/usm/usm_adapter_native_cpu.match b/test/conformance/usm/usm_adapter_native_cpu.match index 603a25b3e2..ada1fe040e 100644 --- a/test/conformance/usm/usm_adapter_native_cpu.match +++ b/test/conformance/usm/usm_adapter_native_cpu.match @@ -8,7 +8,6 @@ urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_TYPE urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_BASE_PTR urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_SIZE urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_DEVICE -urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_POOL urUSMGetMemAllocInfoNegativeTest.InvalidNullHandleContext/* urUSMGetMemAllocInfoNegativeTest.InvalidNullPointerMem/* urUSMGetMemAllocInfoNegativeTest.InvalidEnumeration/* diff --git a/test/conformance/usm/usm_adapter_opencl.match b/test/conformance/usm/usm_adapter_opencl.match deleted file mode 100644 index 2fffa9b0ed..0000000000 --- a/test/conformance/usm/usm_adapter_opencl.match +++ /dev/null @@ -1 +0,0 @@ -urUSMGetMemAllocInfoTest.Success/*__UR_USM_ALLOC_INFO_POOL diff --git a/test/conformance/virtual_memory/urVirtualMemGetInfo.cpp b/test/conformance/virtual_memory/urVirtualMemGetInfo.cpp index 79579e9297..041a749b5b 100644 --- a/test/conformance/virtual_memory/urVirtualMemGetInfo.cpp +++ b/test/conformance/virtual_memory/urVirtualMemGetInfo.cpp @@ -13,8 +13,10 @@ UUR_TEST_SUITE_P(urVirtualMemGetInfoTestWithParam, TEST_P(urVirtualMemGetInfoTestWithParam, Success) { size_t info_size = 0; ur_virtual_mem_info_t info = getParam(); - ASSERT_SUCCESS(urVirtualMemGetInfo(context, virtual_ptr, size, info, 0, - nullptr, &info_size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY(urVirtualMemGetInfo(context, virtual_ptr, + size, info, 0, nullptr, + &info_size), + info); ASSERT_NE(info_size, 0); std::vector data(info_size); diff --git a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp index ddd1143a21..2fac7a1093 100644 --- a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp +++ b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp @@ -28,8 +28,10 @@ UUR_TEST_SUITE_P( TEST_P(urVirtualMemGranularityGetInfoTest, Success) { size_t size = 0; ur_virtual_mem_granularity_info_t info = getParam(); - ASSERT_SUCCESS(urVirtualMemGranularityGetInfo(context, device, info, 0, - nullptr, &size)); + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urVirtualMemGranularityGetInfo(context, device, info, 0, nullptr, + &size), + info); ASSERT_NE(size, 0); std::vector infoData(size); From 
e625bfdaa99bdffd82de9d59dddda7c9897fa107 Mon Sep 17 00:00:00 2001
From: Ben Tracy
Date: Mon, 16 Dec 2024 16:31:30 +0000
Subject: [PATCH 136/148] [L0] Fix issue with test include path

- Fix the include path for the L0 tracing layer header so that it is
  picked up from the deps downloaded during the project build.
---
 test/adapters/level_zero/ze_tracer_common.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/adapters/level_zero/ze_tracer_common.hpp b/test/adapters/level_zero/ze_tracer_common.hpp
index ed33eb30a5..9d25a5f452 100644
--- a/test/adapters/level_zero/ze_tracer_common.hpp
+++ b/test/adapters/level_zero/ze_tracer_common.hpp
@@ -5,7 +5,7 @@

 #include "uur/fixtures.h"

-#include
+#include

 #include
 #include

From 76054dd1c20598b0ff9975e5a14e74f77b7c9a35 Mon Sep 17 00:00:00 2001
From: Ben Tracy
Date: Mon, 2 Dec 2024 13:39:06 +0000
Subject: [PATCH 137/148] [CUDA] Fix potential issue with command buffer fills
 on CUDA

- Fix a potential issue where decomposed fill nodes for large patterns
  would overwrite external event dependencies provided by the user when
  stored in a command handle
- Also store decomposed nodes in fill command handles for future use
  when updating.
- Add missing event_sync tests for large pattern fills (> 4 bytes)
---
 source/adapters/cuda/command_buffer.cpp       |  45 ++++---
 source/adapters/cuda/command_buffer.hpp       |  26 +++-
 .../exp_command_buffer/event_sync.cpp         |  72 ++++++++++
 ...command_buffer_adapter_level_zero_v2.match |   4 +
 .../exp_command_buffer/update/event_sync.cpp  | 124 ++++++++++++++++++
 5 files changed, 249 insertions(+), 22 deletions(-)

diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 9d54422981..35dce59002 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -236,16 +236,29 @@ static ur_result_t enqueueCommandBufferFillHelper(
                                        EventWaitList));
   }

+  // CUDA has no memset functions that allow setting values more than 4
+  // bytes. UR API lets you pass an arbitrary "pattern" to the buffer
+  // fill, which can be more than 4 bytes. Calculate the number of steps
+  // here to see if the fill must be decomposed into multiple nodes.
+  size_t NumberOfSteps = PatternSize / sizeof(uint8_t);
+
   // Graph node added to graph, if multiple nodes are created this will
   // be set to the leaf node
   CUgraphNode GraphNode;
+  // Track if multiple nodes are created so we can pass them to the command
+  // handle
+  std::vector<CUgraphNode> DecomposedNodes;
+
+  if (NumberOfSteps > 4) {
+    DecomposedNodes.reserve(NumberOfSteps);
+  }

   const size_t N = Size / PatternSize;
   auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
                     ? *static_cast<CUdeviceptr *>(DstDevice)
                     : (CUdeviceptr)DstDevice;

-  if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
+  if (NumberOfSteps <= 4) {
     CUDA_MEMSET_NODE_PARAMS NodeParams = {};
     NodeParams.dst = DstPtr;
     NodeParams.elementSize = PatternSize;
@@ -276,14 +289,9 @@ static ur_result_t enqueueCommandBufferFillHelper(
         &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
         DepsList.size(), &NodeParams,
         CommandBuffer->Device->getNativeContext()));
   } else {
-    // CUDA has no memset functions that allow setting values more than 4
-    // bytes. UR API lets you pass an arbitrary "pattern" to the buffer
-    // fill, which can be more than 4 bytes. We must break up the pattern
-    // into 1 byte values, and set the buffer using multiple strided calls.
-    // This means that one cuGraphAddMemsetNode call is made for every 1
-    // bytes in the pattern.
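
As an aside, the decomposition performed by this change is easier to see outside of a diff. The following self-contained sketch is not part of the patch; it mirrors the loop in this change by chaining one strided 1-byte cuGraphAddMemsetNode call per pattern byte. The helper name addDecomposedFillNodes is hypothetical, and it returns the first failing CUresult rather than using the adapter's UR_CHECK_ERROR macro.

#include <cuda.h>
#include <cstdint>
#include <vector>

// Sketch only: split an N-byte fill pattern into N chained 1-byte memset
// graph nodes. In the adapter, the first node instead depends on the
// user's external wait events (DepsList).
static CUresult addDecomposedFillNodes(CUgraph Graph, CUcontext Ctx,
                                       CUdeviceptr Dst, const uint8_t *Pattern,
                                       size_t PatternSize, size_t NumElements,
                                       std::vector<CUgraphNode> &Nodes) {
  CUgraphNode Prev = nullptr;
  for (size_t Step = 0; Step < PatternSize; ++Step) {
    CUDA_MEMSET_NODE_PARAMS Params = {};
    Params.dst = Dst + Step;     // begin at this byte of the pattern
    Params.pitch = PatternSize;  // stride between consecutive elements
    Params.elementSize = 1;      // write a single byte per element
    Params.value = Pattern[Step];
    Params.width = 1;
    Params.height = NumElements; // one 1-byte write per filled element
    CUgraphNode Node;
    CUresult Res = cuGraphAddMemsetNode(&Node, Graph, Prev ? &Prev : nullptr,
                                        Prev ? 1 : 0, &Params, Ctx);
    if (Res != CUDA_SUCCESS) {
      return Res;
    }
    Nodes.push_back(Node); // keep the node so a command handle can store it
    Prev = Node;           // chain the next byte's node after this one
  }
  return CUDA_SUCCESS;
}

Chaining each node only to its predecessor is what keeps DepsList, the user's external event dependencies, attached solely to the first node, which is precisely the overwrite bug this patch fixes.
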
-
-    size_t NumberOfSteps = PatternSize / sizeof(uint8_t);
+    // We must break up the rest of the pattern into 1 byte values, and set
+    // the buffer using multiple strided calls. This means that one
+    // cuGraphAddMemsetNode call is made for every byte in the pattern.

     // Update NodeParam
     CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {};
@@ -294,12 +302,13 @@ static ur_result_t enqueueCommandBufferFillHelper(
     NodeParamsStepFirst.value = *static_cast<const uint8_t *>(Pattern);
     NodeParamsStepFirst.width = 1;

+    // Initial decomposed node depends on the provided external event wait
+    // nodes
     UR_CHECK_ERROR(cuGraphAddMemsetNode(
         &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
         DepsList.size(), &NodeParamsStepFirst,
         CommandBuffer->Device->getNativeContext()));

-    DepsList.clear();
-    DepsList.push_back(GraphNode);
+    DecomposedNodes.push_back(GraphNode);

     // we walk up the pattern in 1-byte steps, and call cuMemset for each
     // 1-byte chunk of the pattern.
@@ -319,13 +328,16 @@ static ur_result_t enqueueCommandBufferFillHelper(
       NodeParamsStep.value = Value;
       NodeParamsStep.width = 1;

+      // Copy the last GraphNode ptr so we can pass it as the dependency for
+      // the next one
+      CUgraphNode PrevNode = GraphNode;
+
       UR_CHECK_ERROR(cuGraphAddMemsetNode(
-          &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
-          DepsList.size(), &NodeParamsStep,
+          &GraphNode, CommandBuffer->CudaGraph, &PrevNode, 1, &NodeParamsStep,
           CommandBuffer->Device->getNativeContext()));

-      DepsList.clear();
-      DepsList.push_back(GraphNode);
+      // Store the decomposed node
+      DecomposedNodes.push_back(GraphNode);
     }
   }

@@ -344,7 +356,8 @@ static ur_result_t enqueueCommandBufferFillHelper(
   std::vector<CUgraphNode> WaitNodes =
       NumEventsInWaitList ? std::move(DepsList) : std::vector<CUgraphNode>();
-  auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes);
+  auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes,
+                          std::move(DecomposedNodes));

   CommandBuffer->CommandHandles.push_back(NewCommand);

   if (RetCommand) {
diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp
index 67d725c3ad..839d3fb159 100644
--- a/source/adapters/cuda/command_buffer.hpp
+++ b/source/adapters/cuda/command_buffer.hpp
@@ -172,12 +172,19 @@ struct usm_memcpy_command_handle : ur_exp_command_buffer_command_handle_t_ {
 struct usm_fill_command_handle : ur_exp_command_buffer_command_handle_t_ {
   usm_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer,
                           CUgraphNode Node, CUgraphNode SignalNode,
-                          const std::vector<CUgraphNode> &WaitNodes)
+                          const std::vector<CUgraphNode> &WaitNodes,
+                          const std::vector<CUgraphNode> &DecomposedNodes = {})
       : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode,
-                                                WaitNodes) {}
+                                                WaitNodes),
+        DecomposedNodes(std::move(DecomposedNodes)) {}
   CommandType getCommandType() const noexcept override {
     return CommandType::USMFill;
   }
+
+  // If this fill command was decomposed into multiple nodes, this vector
+  // contains all of those nodes in the order they were added to the graph.
+  // Currently unused but will be required for updating in future.
+  std::vector<CUgraphNode> DecomposedNodes;
 };

 struct buffer_copy_command_handle : ur_exp_command_buffer_command_handle_t_ {
@@ -250,14 +257,21 @@ struct buffer_write_rect_command_handle
 };

 struct buffer_fill_command_handle : ur_exp_command_buffer_command_handle_t_ {
-  buffer_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer,
-                             CUgraphNode Node, CUgraphNode SignalNode,
-                             const std::vector<CUgraphNode> &WaitNodes)
+  buffer_fill_command_handle(
+      ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node,
+      CUgraphNode SignalNode, const std::vector<CUgraphNode> &WaitNodes,
+      const std::vector<CUgraphNode> &DecomposedNodes = {})
       : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode,
-                                                WaitNodes) {}
+                                                WaitNodes),
+        DecomposedNodes(std::move(DecomposedNodes)) {}
   CommandType getCommandType() const noexcept override {
     return CommandType::MemBufferFill;
   }
+
+  // If this fill command was decomposed into multiple nodes, this vector
+  // contains all of those nodes in the order they were added to the graph.
+  // Currently unused but will be required for updating in future.
+  std::vector<CUgraphNode> DecomposedNodes;
 };

 struct usm_prefetch_command_handle : ur_exp_command_buffer_command_handle_t_ {
diff --git a/test/conformance/exp_command_buffer/event_sync.cpp b/test/conformance/exp_command_buffer/event_sync.cpp
index a4356f8a29..d45785f2e3 100644
--- a/test/conformance/exp_command_buffer/event_sync.cpp
+++ b/test/conformance/exp_command_buffer/event_sync.cpp
@@ -75,6 +75,42 @@ TEST_P(CommandEventSyncTest, USMFillExp) {
     }
 }

+// Test fill using a large pattern size since implementations may need to handle
+// this differently.
+TEST_P(CommandEventSyncTest, USMFillLargePatternExp) {
+    // Device ptrs are allocated in the test fixture with 32-bit values * num
+    // elements, since we are doubling the pattern size we want to treat those
+    // device pointers as if they were created with half the number of elements.
+    constexpr size_t modifiedElementSize = elements / 2;
+    // Get wait event from queue fill on ptr 0
+    uint64_t patternX = 42;
+    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX),
+                                    &patternX, allocation_size, 0, nullptr,
+                                    &external_events[0]));
+
+    // Test fill command overwriting ptr 0 waiting on queue event
+    uint64_t patternY = 0xA;
+    ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp(
+        cmd_buf_handle, device_ptrs[0], &patternY, sizeof(patternY),
+        allocation_size, 0, nullptr, 1, &external_events[0], nullptr,
+        &external_events[1], nullptr));
+    ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle));
+    ASSERT_SUCCESS(
+        urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr));
+
+    // Queue read ptr 0 based on event returned from command-buffer command
+    std::array<uint64_t, modifiedElementSize> host_enqueue_ptr{};
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(),
+                                      device_ptrs[0], allocation_size, 1,
+                                      &external_events[1], nullptr));
+
+    // Verify
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+}
+
 TEST_P(CommandEventSyncTest, MemBufferCopyExp) {
     // Get wait event from queue fill on buffer 0
     uint32_t patternX = 42;
@@ -341,6 +377,42 @@ TEST_P(CommandEventSyncTest, MemBufferFillExp) {
     }
 }

+// Test fill using a large pattern size since implementations may need to handle
+// this differently.
+TEST_P(CommandEventSyncTest, MemBufferFillLargePatternExp) {
+    // Device buffers are allocated in the test fixture with 32-bit values * num
+    // elements, since we are doubling the pattern size we want to treat those
+    // device pointers as if they were created with half the number of elements.
+    constexpr size_t modifiedElementSize = elements / 2;
+    // Get wait event from queue fill on buffer 0
+    uint64_t patternX = 42;
+    ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX,
+                                          sizeof(patternX), 0, allocation_size,
+                                          0, nullptr, &external_events[0]));
+
+    // Test fill command overwriting buffer 0 based on queue event
+    uint64_t patternY = 0xA;
+    ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp(
+        cmd_buf_handle, buffers[0], &patternY, sizeof(patternY), 0,
+        allocation_size, 0, nullptr, 1, &external_events[0], nullptr,
+        &external_events[1], nullptr));
+    ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle));
+    ASSERT_SUCCESS(
+        urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr));
+
+    // Queue read buffer 0 based on event returned from command-buffer command
+    std::array<uint64_t, modifiedElementSize> host_enqueue_ptr{};
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(
+        queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(),
+        1, &external_events[1], nullptr));
+
+    // Verify
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+}
+
 TEST_P(CommandEventSyncTest, USMPrefetchExp) {
     // Get wait event from queue fill on ptr 0
     uint32_t patternX = 42;
diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match
index 5cd3d2a0ff..f6efce7966 100644
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match
+++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match
@@ -33,6 +33,8 @@ CommandEventSyncTest.USMPrefetchExp/*
 CommandEventSyncTest.USMAdviseExp/*
 CommandEventSyncTest.MultipleEventCommands/*
 CommandEventSyncTest.MultipleEventCommandsBetweenCommandBuffers/*
+CommandEventSyncTest.USMFillLargePatternExp/*
+CommandEventSyncTest.MemBufferFillLargePatternExp/*
 CommandEventSyncUpdateTest.USMMemcpyExp/*
 CommandEventSyncUpdateTest.USMFillExp/*
 CommandEventSyncUpdateTest.MemBufferCopyExp/*
@@ -45,3 +47,5 @@ CommandEventSyncUpdateTest.MemBufferFillExp/*
 CommandEventSyncUpdateTest.USMPrefetchExp/*
 CommandEventSyncUpdateTest.USMAdviseExp/*
 CommandEventSyncUpdateTest.MultipleEventCommands/*
+CommandEventSyncUpdateTest.USMFillLargePatternExp/*
+CommandEventSyncUpdateTest.MemBufferFillLargePatternExp/*
diff --git a/test/conformance/exp_command_buffer/update/event_sync.cpp b/test/conformance/exp_command_buffer/update/event_sync.cpp
index 13e1bed968..98e7ef469e 100644
--- a/test/conformance/exp_command_buffer/update/event_sync.cpp
+++ b/test/conformance/exp_command_buffer/update/event_sync.cpp
@@ -129,6 +129,68 @@ TEST_P(CommandEventSyncUpdateTest, USMFillExp) {
     }
 }

+// Test fill using a large pattern size since implementations may need to handle
+// this differently.
+TEST_P(CommandEventSyncUpdateTest, USMFillLargePatternExp) {
+    // Device ptrs are allocated in the test fixture with 32-bit values * num
+    // elements, since we are doubling the pattern size we want to treat those
+    // device pointers as if they were created with half the number of elements.
+    constexpr size_t modifiedElementSize = elements / 2;
+    // Get wait event from queue fill on ptr 0
+    uint64_t patternX = 42;
+    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX),
+                                    &patternX, allocation_size, 0, nullptr,
+                                    &external_events[0]));
+
+    // Test fill command overwriting ptr 0 waiting on queue event
+    uint64_t patternY = 0xA;
+    ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp(
+        updatable_cmd_buf_handle, device_ptrs[0], &patternY, sizeof(patternY),
+        allocation_size, 0, nullptr, 1, &external_events[0], nullptr,
+        &external_events[1], &command_handles[0]));
+    ASSERT_NE(nullptr, command_handles[0]);
+    ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+
+    // Queue read ptr 0 based on event returned from command-buffer command
+    std::array<uint64_t, modifiedElementSize> host_enqueue_ptr{};
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(),
+                                      device_ptrs[0], allocation_size, 1,
+                                      &external_events[1], nullptr));
+
+    // Verify
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+
+    uint64_t patternZ = 666;
+    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternZ),
+                                    &patternZ, allocation_size, 0, nullptr,
+                                    &external_events[2]));
+
+    // Update the command's wait event to wait on the fill of the new value
+    ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1,
+                                                      &external_events[2]));
+
+    // Get a new signal event for command-buffer
+    ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0],
+                                                       &external_events[3]));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(),
+                                      device_ptrs[0], allocation_size, 1,
+                                      &external_events[3], nullptr));
+
+    // Verify update
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+}
+
 TEST_P(CommandEventSyncUpdateTest, MemBufferCopyExp) {
     // Get wait event from queue fill on buffer 0
     uint32_t patternX = 42;
@@ -532,6 +594,68 @@ TEST_P(CommandEventSyncUpdateTest, MemBufferWriteRectExp) {
     }
 }

+// Test fill using a large pattern size since implementations may need to handle
+// this differently.
+TEST_P(CommandEventSyncUpdateTest, MemBufferFillLargePatternExp) {
+    // Device buffers are allocated in the test fixture with 32-bit values * num
+    // elements, since we are doubling the pattern size we want to treat those
+    // device pointers as if they were created with half the number of elements.
+    constexpr size_t modifiedElementSize = elements / 2;
+    // Get wait event from queue fill on buffer 0
+    uint64_t patternX = 42;
+    ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX,
+                                          sizeof(patternX), 0, allocation_size,
+                                          0, nullptr, &external_events[0]));
+
+    // Test fill command overwriting buffer 0 based on queue event
+    uint64_t patternY = 0xA;
+    ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp(
+        updatable_cmd_buf_handle, buffers[0], &patternY, sizeof(patternY), 0,
+        allocation_size, 0, nullptr, 1, &external_events[0], nullptr,
+        &external_events[1], &command_handles[0]));
+    ASSERT_NE(nullptr, command_handles[0]);
+    ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+
+    // Queue read buffer 0 based on event returned from command-buffer command
+    std::array<uint64_t, modifiedElementSize> host_enqueue_ptr{};
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(
+        queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(),
+        1, &external_events[1], nullptr));
+
+    // Verify
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+
+    uint64_t patternZ = 666;
+    ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ,
+                                          sizeof(patternZ), 0, allocation_size,
+                                          0, nullptr, &external_events[2]));
+
+    // Update the command's wait event to wait on the fill of the new value
+    ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1,
+                                                      &external_events[2]));
+
+    // Get a new signal event for command-buffer
+    ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0],
+                                                       &external_events[3]));
+
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(
+        queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(),
+        1, &external_events[3], nullptr));
+
+    // Verify update
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    for (size_t i = 0; i < modifiedElementSize; i++) {
+        ASSERT_EQ(host_enqueue_ptr[i], patternY);
+    }
+}
+
 TEST_P(CommandEventSyncUpdateTest, MemBufferFillExp) {
     // Get wait event from queue fill on buffer 0
     uint32_t patternX = 42;

From 123e139daf869ea5fb8955d3b83e9f7516614b16 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Mon, 16 Dec 2024 11:12:40 -0800
Subject: [PATCH 138/148] [L0] For 3-Channel memory, disable Copy engine Usage

- Due to limitations on certain systems, disable the usage of copy
  engines given 3-channel memory
- Currently only bindless requires this check, since 3-channel images are
  only used in bindless and the copy engine is disabled by default in
  enqueueMemImageCommandHelper.

Signed-off-by: Neil R. Spruit
---
 source/adapters/level_zero/image.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index 09bdc16a64..b8bff77a2f 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -265,6 +265,16 @@ ur_result_t ze2urImageFormat(const ze_image_desc_t *ZeImageDesc,
   return UR_RESULT_SUCCESS;
 }

+bool Is3ChannelOrder(ur_image_channel_order_t ChannelOrder) {
+  switch (ChannelOrder) {
+  case UR_IMAGE_CHANNEL_ORDER_RGB:
+  case UR_IMAGE_CHANNEL_ORDER_RGX:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// Construct ZE image desc from UR image format and desc.
 ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat,
                            const ur_image_desc_t *ImageDesc,
@@ -843,6 +853,14 @@ ur_result_t urBindlessImagesImageCopyExp(
   UR_CALL(ur2zeImageDesc(pSrcImageFormat, pSrcImageDesc, ZeImageDesc));

   bool UseCopyEngine = hQueue->useCopyEngine(/*PreferCopyEngine*/ true);
+  // Due to a limitation of the copy engine, disable use of the copy engine
+  // when given a 3-channel image
+  if (Is3ChannelOrder(
+          ur_cast<ur_image_channel_order_t>(pSrcImageFormat->channelOrder)) ||
+      Is3ChannelOrder(
+          ur_cast<ur_image_channel_order_t>(pDstImageFormat->channelOrder))) {
+    UseCopyEngine = false;
+  }

   _ur_ze_event_list_t TmpWaitList;
   UR_CALL(TmpWaitList.createAndRetainUrZeEventList(

From 791562ac528316af28dead52a7ec0d02bbdf518e Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Mon, 16 Dec 2024 16:59:17 -0800
Subject: [PATCH 139/148] Update source/adapters/level_zero/image.cpp

Co-authored-by: Wenju He
---
 source/adapters/level_zero/image.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index b8bff77a2f..8c205f54c5 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -265,7 +265,7 @@ ur_result_t ze2urImageFormat(const ze_image_desc_t *ZeImageDesc,
   return UR_RESULT_SUCCESS;
 }

-bool Is3ChannelOrder(ur_image_channel_order_t ChannelOrder) {
+static bool Is3ChannelOrder(ur_image_channel_order_t ChannelOrder) {
   switch (ChannelOrder) {
   case UR_IMAGE_CHANNEL_ORDER_RGB:
   case UR_IMAGE_CHANNEL_ORDER_RGX:

From 39731363a8e00f110f6557aafd0d31dd7ea75bad Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Tue, 17 Dec 2024 08:11:31 -0800
Subject: [PATCH 140/148] [L0] Fix use after free with Module build strings

- Address Sanitizer HASAN fails with temporary strings even after the
  last use-after-free patch. This change resolves all use-after-free of
  these strings, satisfying the address sanitizer checks with HASAN.

Signed-off-by: Neil R. Spruit
---
 source/adapters/level_zero/program.cpp | 6 ++----
 source/adapters/level_zero/program.hpp | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp
index be8c366d6b..b5a64c3eda 100644
--- a/source/adapters/level_zero/program.cpp
+++ b/source/adapters/level_zero/program.cpp
@@ -452,11 +452,9 @@ ur_result_t urProgramLinkExp(

       // Build flags may be different for different devices, so handle them
       // here. Clear values of the previous device first.
       BuildFlagPtrs.clear();
-      std::vector<std::string> TemporaryOptionsStrings;
       for (uint32_t I = 0; I < count; I++) {
-        TemporaryOptionsStrings.push_back(
-            phPrograms[I]->getBuildOptions(ZeDevice));
-        BuildFlagPtrs.push_back(TemporaryOptionsStrings.back().c_str());
+        BuildFlagPtrs.push_back(
+            phPrograms[I]->getBuildOptions(ZeDevice).c_str());
       }
       ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data();
       if (count == 1)
diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp
index 4fe8c24acd..90b297fa40 100644
--- a/source/adapters/level_zero/program.hpp
+++ b/source/adapters/level_zero/program.hpp
@@ -169,7 +169,7 @@ struct ur_program_handle_t_ : _ur_object {
     DeviceDataMap[ZeDevice].BuildFlags += Options;
   }

-  std::string getBuildOptions(ze_device_handle_t ZeDevice) {
+  std::string &getBuildOptions(ze_device_handle_t ZeDevice) {
     return DeviceDataMap[ZeDevice].BuildFlags;
   }

From 3ab955f2f92005dfdc5736d2eee3ec01d0514560 Mon Sep 17 00:00:00 2001
From: Aaron Greig
Date: Mon, 25 Nov 2024 14:45:27 +0000
Subject: [PATCH 141/148] Enable adapter tests to run on all discovered
 adapters.

---
 .../native_cpu/ur_interface_loader.cpp        |  1 +
 .../adapter/adapter_adapter_native_cpu.match  |  5 --
 test/conformance/adapter/fixtures.h           | 53 ------------
 test/conformance/adapter/urAdapterGet.cpp     | 21 ++++-
 test/conformance/adapter/urAdapterGetInfo.cpp | 74 +++++++----------
 .../adapter/urAdapterGetLastError.cpp         | 20 +++--
 test/conformance/adapter/urAdapterRelease.cpp | 16 ++--
 test/conformance/adapter/urAdapterRetain.cpp  | 16 ++--
 test/conformance/platform/fixtures.h          |  8 ++
 test/conformance/source/environment.cpp       | 83 ++++++++++---------
 test/conformance/source/main.cpp              | 11 +--
 .../testing/include/uur/environment.h         | 14 +++-
 .../testing/include/uur/fixtures.h            | 15 ++++
 test/conformance/testing/include/uur/utils.h  |  2 +
 test/conformance/testing/source/utils.cpp     | 10 +++
 15 files changed, 167 insertions(+), 182 deletions(-)
 delete mode 100644 test/conformance/adapter/adapter_adapter_native_cpu.match
 delete mode 100644 test/conformance/adapter/fixtures.h

diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp
index a913aa4c4e..b1014c7459 100644
--- a/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -207,6 +207,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable(
   pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo;
   pDdiTable->pfnAdapterRelease = urAdapterRelease;
   pDdiTable->pfnAdapterRetain = urAdapterRetain;
+  pDdiTable->pfnAdapterGetLastError = urAdapterGetLastError;
   return UR_RESULT_SUCCESS;
 }

diff --git a/test/conformance/adapter/adapter_adapter_native_cpu.match b/test/conformance/adapter/adapter_adapter_native_cpu.match
deleted file mode 100644
index ea65399d2f..0000000000
--- a/test/conformance/adapter/adapter_adapter_native_cpu.match
+++ /dev/null
@@ -1,5 +0,0 @@
-# These pass when the adapter is launched by the loader
-{{OPT}}urAdapterGetLastErrorTest.Success
-{{OPT}}urAdapterGetLastErrorTest.InvalidHandle
-{{OPT}}urAdapterGetLastErrorTest.InvalidMessagePtr
-{{OPT}}urAdapterGetLastErrorTest.InvalidErrorPtr
diff --git a/test/conformance/adapter/fixtures.h b/test/conformance/adapter/fixtures.h
deleted file mode 100644
index 31b2a2265d..0000000000
--- a/test/conformance/adapter/fixtures.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (C) 2022-2023 Intel Corporation
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
-// See LICENSE.TXT
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include
-namespace uur {
-namespace runtime {
-
-struct urTest : ::testing::Test {
-
-    void SetUp() override {
-        ur_device_init_flags_t device_flags = 0;
-        ASSERT_SUCCESS(urLoaderConfigCreate(&loader_config));
-        ASSERT_SUCCESS(urLoaderConfigEnableLayer(loader_config,
-                                                 "UR_LAYER_FULL_VALIDATION"));
-        ASSERT_SUCCESS(urLoaderInit(device_flags, loader_config));
-    }
-
-    void TearDown() override {
-        if (loader_config) {
-            ASSERT_SUCCESS(urLoaderConfigRelease(loader_config));
-        }
-        ASSERT_SUCCESS(urLoaderTearDown());
-    }
-
-    ur_loader_config_handle_t loader_config = nullptr;
-};
-
-struct urAdapterTest : urTest {
-
-    void SetUp() override {
-        UUR_RETURN_ON_FATAL_FAILURE(urTest::SetUp());
-
-        uint32_t adapter_count;
-        ASSERT_SUCCESS(urAdapterGet(0, nullptr, &adapter_count));
-        ASSERT_GT(adapter_count, 0);
-        adapters.resize(adapter_count);
-        ASSERT_SUCCESS(urAdapterGet(adapter_count, adapters.data(), nullptr));
-    }
-
-    void TearDown() override {
-        for (auto adapter : adapters) {
-            ASSERT_SUCCESS(urAdapterRelease(adapter));
-        }
-        UUR_RETURN_ON_FATAL_FAILURE(urTest::TearDown());
-    }
-
-    std::vector<ur_adapter_handle_t> adapters;
-};
-
-} // namespace runtime
-} // namespace uur
diff --git a/test/conformance/adapter/urAdapterGet.cpp b/test/conformance/adapter/urAdapterGet.cpp
index 24d1a33057..f24dfe51ef 100644
--- a/test/conformance/adapter/urAdapterGet.cpp
+++ b/test/conformance/adapter/urAdapterGet.cpp
@@ -3,9 +3,26 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

-#include "fixtures.h"
+#include

-using urAdapterGetTest = uur::runtime::urTest;
+struct urAdapterGetTest : ::testing::Test {
+    void SetUp() override {
+        ur_device_init_flags_t device_flags = 0;
+        ASSERT_SUCCESS(urLoaderConfigCreate(&loader_config));
+        ASSERT_SUCCESS(urLoaderConfigEnableLayer(loader_config,
+                                                 "UR_LAYER_FULL_VALIDATION"));
+        ASSERT_SUCCESS(urLoaderInit(device_flags, loader_config));
+    }
+
+    void TearDown() override {
+        if (loader_config) {
+            ASSERT_SUCCESS(urLoaderConfigRelease(loader_config));
+        }
+        ASSERT_SUCCESS(urLoaderTearDown());
+    }
+
+    ur_loader_config_handle_t loader_config = nullptr;
+};

 TEST_F(urAdapterGetTest, Success) {
     uint32_t adapter_count;
diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp
index 280948bc13..b8604ccd19 100644
--- a/test/conformance/adapter/urAdapterGetInfo.cpp
+++ b/test/conformance/adapter/urAdapterGetInfo.cpp
@@ -3,95 +3,83 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

-#include "fixtures.h"
+#include

 #include

-struct urAdapterGetInfoTest : uur::runtime::urAdapterTest,
-                              ::testing::WithParamInterface<ur_adapter_info_t> {
-
-    void SetUp() {
-        UUR_RETURN_ON_FATAL_FAILURE(uur::runtime::urAdapterTest::SetUp());
-        adapter = adapters[0];
-    }
-
-    ur_adapter_handle_t adapter;
-};
-
-std::unordered_map<ur_adapter_info_t, size_t> adapter_info_size_map = {
-    {UR_ADAPTER_INFO_BACKEND, sizeof(ur_adapter_backend_t)},
-    {UR_ADAPTER_INFO_VERSION, sizeof(uint32_t)},
-    {UR_ADAPTER_INFO_REFERENCE_COUNT, sizeof(uint32_t)},
-};
-
-INSTANTIATE_TEST_SUITE_P(
-    urAdapterGetInfo, urAdapterGetInfoTest,
-    ::testing::Values(UR_ADAPTER_INFO_BACKEND, UR_ADAPTER_INFO_VERSION,
-                      UR_ADAPTER_INFO_REFERENCE_COUNT),
-    [](const ::testing::TestParamInfo<ur_adapter_info_t> &info) {
-        std::stringstream ss;
-        ss << info.param;
-        return ss.str();
-    });
-
-TEST_P(urAdapterGetInfoTest, Success) {
+using urAdapterGetInfoTest = uur::urAdapterTest;
+
+UUR_INSTANTIATE_ADAPTER_TEST_SUITE_P(urAdapterGetInfoTest);
+
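
As an aside, the rewritten tests above all exercise UR's two-call query idiom. A minimal standalone sketch, not part of the patch and with error checking elided for brevity, looks like this:

#include <ur_api.h>

// Sketch only: query the required size first, then fetch the value into a
// buffer of exactly that size.
static ur_adapter_backend_t queryAdapterBackend(ur_adapter_handle_t Adapter) {
    size_t Size = 0;
    // First call reports the size required for this query.
    urAdapterGetInfo(Adapter, UR_ADAPTER_INFO_BACKEND, 0, nullptr, &Size);
    ur_adapter_backend_t Backend = UR_ADAPTER_BACKEND_UNKNOWN;
    // Second call writes the value into the correctly sized buffer.
    urAdapterGetInfo(Adapter, UR_ADAPTER_INFO_BACKEND, Size, &Backend, nullptr);
    return Backend;
}

Passing a size smaller than the reported one is exactly what the InvalidSizeSmall test below expects to fail with UR_RESULT_ERROR_INVALID_SIZE.
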
+TEST_P(urAdapterGetInfoTest, Backend) { + auto info_type = UR_ADAPTER_INFO_BACKEND; size_t size = 0; - ur_adapter_info_t info_type = GetParam(); ASSERT_SUCCESS_OR_OPTIONAL_QUERY( urAdapterGetInfo(adapter, info_type, 0, nullptr, &size), info_type); ASSERT_NE(size, 0); - if (const auto expected_size = adapter_info_size_map.find(info_type); - expected_size != adapter_info_size_map.end()) { - ASSERT_EQ(expected_size->second, size); - } + ASSERT_EQ(size, sizeof(ur_adapter_backend_t)); std::vector info_data(size); ASSERT_SUCCESS( urAdapterGetInfo(adapter, info_type, size, info_data.data(), nullptr)); } +TEST_P(urAdapterGetInfoTest, ReferenceCount) { + auto info_type = UR_ADAPTER_INFO_REFERENCE_COUNT; + size_t size = 0; + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urAdapterGetInfo(adapter, info_type, 0, nullptr, &size), info_type); + ASSERT_EQ(size, sizeof(uint32_t)); + + uint32_t reference_count = 0; + ASSERT_SUCCESS( + urAdapterGetInfo(adapter, info_type, size, &reference_count, nullptr)); + ASSERT_GE(reference_count, 0); +} + TEST_P(urAdapterGetInfoTest, InvalidNullHandleAdapter) { size_t size = 0; - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urAdapterGetInfo(nullptr, GetParam(), 0, nullptr, &size)); + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urAdapterGetInfo(nullptr, UR_ADAPTER_INFO_BACKEND, 0, nullptr, &size)); } -TEST_F(urAdapterGetInfoTest, InvalidEnumerationAdapterInfoType) { +TEST_P(urAdapterGetInfoTest, InvalidEnumerationAdapterInfoType) { size_t size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urAdapterGetInfo(adapter, UR_ADAPTER_INFO_FORCE_UINT32, 0, nullptr, &size)); } -TEST_F(urAdapterGetInfoTest, InvalidSizeZero) { +TEST_P(urAdapterGetInfoTest, InvalidSizeZero) { ur_adapter_backend_t backend; ASSERT_EQ_RESULT(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, 0, &backend, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } -TEST_F(urAdapterGetInfoTest, InvalidSizeSmall) { +TEST_P(urAdapterGetInfoTest, InvalidSizeSmall) { ur_adapter_backend_t backend; ASSERT_EQ_RESULT(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, sizeof(backend) - 1, &backend, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } -TEST_F(urAdapterGetInfoTest, InvalidNullPointerPropValue) { +TEST_P(urAdapterGetInfoTest, InvalidNullPointerPropValue) { ur_adapter_backend_t backend; ASSERT_EQ_RESULT(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, sizeof(backend), nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } -TEST_F(urAdapterGetInfoTest, InvalidNullPointerPropSizeRet) { +TEST_P(urAdapterGetInfoTest, InvalidNullPointerPropSizeRet) { ASSERT_EQ_RESULT( urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } -TEST_F(urAdapterGetInfoTest, ReferenceCountNotZero) { +TEST_P(urAdapterGetInfoTest, ReferenceCountNotZero) { uint32_t referenceCount = 0; ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, @@ -100,7 +88,7 @@ TEST_F(urAdapterGetInfoTest, ReferenceCountNotZero) { ASSERT_GT(referenceCount, 0); } -TEST_F(urAdapterGetInfoTest, ValidAdapterBackend) { +TEST_P(urAdapterGetInfoTest, ValidAdapterBackend) { ur_adapter_backend_t backend; ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, sizeof(backend), &backend, nullptr)); diff --git a/test/conformance/adapter/urAdapterGetLastError.cpp b/test/conformance/adapter/urAdapterGetLastError.cpp index a82b0664f0..39728f21b8 100644 --- a/test/conformance/adapter/urAdapterGetLastError.cpp +++ b/test/conformance/adapter/urAdapterGetLastError.cpp @@ -3,33 
+3,35 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -struct urAdapterGetLastErrorTest : uur::runtime::urAdapterTest { +struct urAdapterGetLastErrorTest : uur::urAdapterTest { int32_t error; const char *message = nullptr; }; -TEST_F(urAdapterGetLastErrorTest, Success) { +UUR_INSTANTIATE_ADAPTER_TEST_SUITE_P(urAdapterGetLastErrorTest); + +TEST_P(urAdapterGetLastErrorTest, Success) { // We can't reliably generate a UR_RESULT_ERROR_ADAPTER_SPECIFIC error to // test the full functionality of this entry point, so instead do a minimal // smoke test and check that the call returns successfully, even if no // actual error was set. ASSERT_EQ_RESULT(UR_RESULT_SUCCESS, - urAdapterGetLastError(adapters[0], &message, &error)); + urAdapterGetLastError(adapter, &message, &error)); } -TEST_F(urAdapterGetLastErrorTest, InvalidHandle) { +TEST_P(urAdapterGetLastErrorTest, InvalidHandle) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urAdapterGetLastError(nullptr, &message, &error)); } -TEST_F(urAdapterGetLastErrorTest, InvalidMessagePtr) { +TEST_P(urAdapterGetLastErrorTest, InvalidMessagePtr) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urAdapterGetLastError(adapters[0], nullptr, &error)); + urAdapterGetLastError(adapter, nullptr, &error)); } -TEST_F(urAdapterGetLastErrorTest, InvalidErrorPtr) { +TEST_P(urAdapterGetLastErrorTest, InvalidErrorPtr) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urAdapterGetLastError(adapters[0], &message, nullptr)); + urAdapterGetLastError(adapter, &message, nullptr)); } diff --git a/test/conformance/adapter/urAdapterRelease.cpp b/test/conformance/adapter/urAdapterRelease.cpp index 0b28287aa7..ed7e4bf132 100644 --- a/test/conformance/adapter/urAdapterRelease.cpp +++ b/test/conformance/adapter/urAdapterRelease.cpp @@ -3,18 +3,12 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -struct urAdapterReleaseTest : uur::runtime::urAdapterTest { - void SetUp() { - UUR_RETURN_ON_FATAL_FAILURE(uur::runtime::urAdapterTest::SetUp()); - adapter = adapters[0]; - } +using urAdapterReleaseTest = uur::urAdapterTest; +UUR_INSTANTIATE_ADAPTER_TEST_SUITE_P(urAdapterReleaseTest); - ur_adapter_handle_t adapter; -}; - -TEST_F(urAdapterReleaseTest, Success) { +TEST_P(urAdapterReleaseTest, Success) { uint32_t referenceCountBefore = 0; ASSERT_SUCCESS(urAdapterRetain(adapter)); @@ -31,7 +25,7 @@ TEST_F(urAdapterReleaseTest, Success) { ASSERT_LE(referenceCountAfter, referenceCountBefore); } -TEST_F(urAdapterReleaseTest, InvalidNullHandleAdapter) { +TEST_P(urAdapterReleaseTest, InvalidNullHandleAdapter) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urAdapterRelease(nullptr)); } diff --git a/test/conformance/adapter/urAdapterRetain.cpp b/test/conformance/adapter/urAdapterRetain.cpp index 86967b983b..a6041223af 100644 --- a/test/conformance/adapter/urAdapterRetain.cpp +++ b/test/conformance/adapter/urAdapterRetain.cpp @@ -3,18 +3,12 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -struct urAdapterRetainTest : uur::runtime::urAdapterTest { - void SetUp() { - UUR_RETURN_ON_FATAL_FAILURE(uur::runtime::urAdapterTest::SetUp()); - adapter = adapters[0]; - } +using urAdapterRetainTest = uur::urAdapterTest; +UUR_INSTANTIATE_ADAPTER_TEST_SUITE_P(urAdapterRetainTest); - ur_adapter_handle_t adapter; -}; - -TEST_F(urAdapterRetainTest, Success) { +TEST_P(urAdapterRetainTest, Success) 
{ uint32_t referenceCountBefore = 0; ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, @@ -30,7 +24,7 @@ TEST_F(urAdapterRetainTest, Success) { ASSERT_GT(referenceCountAfter, referenceCountBefore); } -TEST_F(urAdapterRetainTest, InvalidNullHandleAdapter) { +TEST_P(urAdapterRetainTest, InvalidNullHandleAdapter) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urAdapterRetain(nullptr)); } diff --git a/test/conformance/platform/fixtures.h b/test/conformance/platform/fixtures.h index b294e7031a..a11426b25c 100644 --- a/test/conformance/platform/fixtures.h +++ b/test/conformance/platform/fixtures.h @@ -68,6 +68,14 @@ struct urPlatformTest : urPlatformsTest { ur_platform_handle_t platform; }; +#define UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(FIXTURE) \ + INSTANTIATE_TEST_SUITE_P( \ + , FIXTURE, \ + ::testing::ValuesIn(uur::DevicesEnvironment::instance->devices), \ + [](const ::testing::TestParamInfo &info) { \ + return uur::GetPlatformAndDeviceName(info.param); \ + }) + } // namespace platform } // namespace uur diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index cc64ab11ea..e1d1951616 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -24,6 +24,49 @@ namespace uur { constexpr char ERROR_NO_ADAPTER[] = "Could not load adapter"; +AdapterEnvironment *AdapterEnvironment::instance = nullptr; + +AdapterEnvironment::AdapterEnvironment() { + instance = this; + + ur_loader_config_handle_t config; + if (urLoaderConfigCreate(&config) == UR_RESULT_SUCCESS) { + if (urLoaderConfigEnableLayer(config, "UR_LAYER_FULL_VALIDATION") != + UR_RESULT_SUCCESS) { + urLoaderConfigRelease(config); + error = "Failed to enable validation layer"; + return; + } + } else { + error = "Failed to create loader config handle"; + return; + } + + ur_device_init_flags_t device_flags = 0; + auto initResult = urLoaderInit(device_flags, config); + auto configReleaseResult = urLoaderConfigRelease(config); + switch (initResult) { + case UR_RESULT_SUCCESS: + break; + case UR_RESULT_ERROR_UNINITIALIZED: + error = ERROR_NO_ADAPTER; + return; + default: + error = "urLoaderInit() failed"; + return; + } + + if (configReleaseResult) { + error = "Failed to destroy loader config handle"; + return; + } + + uint32_t adapter_count = 0; + urAdapterGet(0, nullptr, &adapter_count); + adapters.resize(adapter_count); + urAdapterGet(adapter_count, adapters.data(), nullptr); +} + PlatformEnvironment *PlatformEnvironment::instance = nullptr; constexpr std::pair backends[] = { @@ -75,54 +118,18 @@ std::ostream &operator<<(std::ostream &out, } uur::PlatformEnvironment::PlatformEnvironment(int argc, char **argv) - : platform_options{parsePlatformOptions(argc, argv)} { + : AdapterEnvironment(), platform_options{parsePlatformOptions(argc, argv)} { instance = this; + // Check for errors from parsing platform options if (!error.empty()) { return; } - ur_loader_config_handle_t config; - if (urLoaderConfigCreate(&config) == UR_RESULT_SUCCESS) { - if (urLoaderConfigEnableLayer(config, "UR_LAYER_FULL_VALIDATION") != - UR_RESULT_SUCCESS) { - urLoaderConfigRelease(config); - error = "Failed to enable validation layer"; - return; - } - } else { - error = "Failed to create loader config handle"; - return; - } - - ur_device_init_flags_t device_flags = 0; - auto initResult = urLoaderInit(device_flags, config); - auto configReleaseResult = urLoaderConfigRelease(config); - switch (initResult) { - case UR_RESULT_SUCCESS: - break; - case 
UR_RESULT_ERROR_UNINITIALIZED:
-        error = ERROR_NO_ADAPTER;
-        return;
-    default:
-        error = "urLoaderInit() failed";
-        return;
-    }
-
-    if (configReleaseResult) {
-        error = "Failed to destroy loader config handle";
-        return;
-    }
-
     selectPlatformFromOptions();
 }

 void uur::PlatformEnvironment::selectPlatformFromOptions() {
-    uint32_t adapter_count = 0;
-    urAdapterGet(0, nullptr, &adapter_count);
-    adapters.resize(adapter_count);
-    urAdapterGet(adapter_count, adapters.data(), nullptr);
-
     struct platform_info {
         ur_adapter_handle_t adapter;
         ur_platform_handle_t platform;
diff --git a/test/conformance/source/main.cpp b/test/conformance/source/main.cpp
index a8f981ebef..1a1e3b0380 100644
--- a/test/conformance/source/main.cpp
+++ b/test/conformance/source/main.cpp
@@ -9,17 +9,14 @@ int main(int argc, char **argv) {
 #ifdef KERNELS_ENVIRONMENT
     auto *environment =
         new uur::KernelsEnvironment(argc, argv, KERNELS_DEFAULT_DIR);
-#endif
-#ifdef DEVICES_ENVIRONMENT
+#elif DEVICES_ENVIRONMENT
     auto *environment = new uur::DevicesEnvironment(argc, argv);
-#endif
-#ifdef PLATFORM_ENVIRONMENT
+#elif PLATFORM_ENVIRONMENT
     auto *environment = new uur::PlatformEnvironment(argc, argv);
+#else
+    auto *environment = new uur::AdapterEnvironment();
 #endif
     ::testing::InitGoogleTest(&argc, argv);
-#if defined(DEVICES_ENVIRONMENT) || defined(PLATFORM_ENVIRONMENT) || \
-    defined(KERNELS_ENVIRONMENT)
     ::testing::AddGlobalTestEnvironment(environment);
-#endif
     return RUN_ALL_TESTS();
 }
diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h
index bba4c583c8..397858333f 100644
--- a/test/conformance/testing/include/uur/environment.h
+++ b/test/conformance/testing/include/uur/environment.h
@@ -14,7 +14,17 @@
 #include

 namespace uur {
-struct PlatformEnvironment : ::testing::Environment {
+struct AdapterEnvironment : ::testing::Environment {
+
+    AdapterEnvironment();
+    virtual ~AdapterEnvironment() override = default;
+
+    std::string error{};
+    std::vector<ur_adapter_handle_t> adapters{};
+    static AdapterEnvironment *instance;
+};
+
+struct PlatformEnvironment : AdapterEnvironment {

     struct PlatformOptions {
         std::string platform_name;
@@ -31,9 +41,7 @@ struct PlatformEnvironment : AdapterEnvironment {
     void selectPlatformFromOptions();
     PlatformOptions parsePlatformOptions(int argc, char **argv);

-    std::string error{};
     PlatformOptions platform_options;
-    std::vector<ur_adapter_handle_t> adapters{};
     ur_adapter_handle_t adapter = nullptr;
     ur_platform_handle_t platform = nullptr;
     static PlatformEnvironment *instance;
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
index e5c5d39591..5435b20a42 100644
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -35,6 +35,13 @@

 namespace uur {

+struct urAdapterTest : ::testing::Test,
+                       ::testing::WithParamInterface<ur_adapter_handle_t> {
+    void SetUp() override { adapter = GetParam(); }
+
+    ur_adapter_handle_t adapter;
+};
+
 struct urPlatformTest : ::testing::Test {
     void SetUp() override {
         platform = uur::PlatformEnvironment::instance->platform;
@@ -107,6 +114,14 @@ struct urDeviceTest : urPlatformTest,
 };
 } // namespace uur

+#define UUR_INSTANTIATE_ADAPTER_TEST_SUITE_P(FIXTURE)                          \
+    INSTANTIATE_TEST_SUITE_P(                                                  \
, FIXTURE, \ diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index 8548b12d11..2415e580c5 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -180,6 +180,8 @@ ur_result_t GetObjectReferenceCount(T object, uint32_t &out_ref_count) { return UR_RESULT_ERROR_INVALID_VALUE; } +std::string GetAdapterBackendName(ur_adapter_handle_t hAdapter); + inline std::string GetPlatformName(ur_platform_handle_t hPlatform) { std::string platform_name; GetPlatformInfo(hPlatform, UR_PLATFORM_INFO_NAME, diff --git a/test/conformance/testing/source/utils.cpp b/test/conformance/testing/source/utils.cpp index 0ab058bc30..6931d3c761 100644 --- a/test/conformance/testing/source/utils.cpp +++ b/test/conformance/testing/source/utils.cpp @@ -3,7 +3,9 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include +#include #include namespace uur { @@ -689,4 +691,12 @@ ur_result_t MakeUSMAllocationByType(ur_usm_type_t type, } } +std::string GetAdapterBackendName(ur_adapter_handle_t hAdapter) { + ur_adapter_backend_t backend = UR_ADAPTER_BACKEND_UNKNOWN; + urAdapterGetInfo(hAdapter, UR_ADAPTER_INFO_BACKEND, sizeof(backend), + &backend, nullptr); + std::stringstream ss; + ss << backend; + return ss.str(); +} } // namespace uur From 16b6d4698d1c63da6cff928d6fd3d01f8dcadeac Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Mon, 18 Nov 2024 23:12:23 +0000 Subject: [PATCH 142/148] [CTS] skip image tests if images are not supported --- .../memory/memory_adapter_level_zero_v2.match | 15 --------------- .../memory/memory_adapter_native_cpu.match | 1 - .../urMemImageCreateWithImageFormatParam.cpp | 4 ++++ .../memory/urMemImageCreateWithNativeHandle.cpp | 2 +- test/conformance/testing/include/uur/fixtures.h | 11 ++++++----- 5 files changed, 11 insertions(+), 22 deletions(-) delete mode 100644 test/conformance/memory/memory_adapter_level_zero_v2.match diff --git a/test/conformance/memory/memory_adapter_level_zero_v2.match b/test/conformance/memory/memory_adapter_level_zero_v2.match deleted file mode 100644 index ec09c7b5ef..0000000000 --- a/test/conformance/memory/memory_adapter_level_zero_v2.match +++ /dev/null @@ -1,15 +0,0 @@ -{{OPT}}urMemBufferPartitionWithFlagsTest.Success/* -{{OPT}}urMemBufferPartitionTest.InvalidValueCreateType/* -{{OPT}}urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/* -{{OPT}}urMemGetInfoImageTest.Success/* -{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/* -{{OPT}}urMemImageGetInfoTest.Success/* -{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/* -{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/* -{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/* -{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/* -{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/* -{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/* -{{OPT}}urMemImageCreateWithNativeHandleTest.Success/* -{{OPT}}urMemImageCreateWithNativeHandleTest.InvalidNullHandle/* -{{OPT}}urMemImageCreateWithNativeHandleTest.InvalidNullPointer/* diff --git a/test/conformance/memory/memory_adapter_native_cpu.match b/test/conformance/memory/memory_adapter_native_cpu.match index aafd22075c..4465185bed 100644 --- a/test/conformance/memory/memory_adapter_native_cpu.match +++ b/test/conformance/memory/memory_adapter_native_cpu.match @@ -4,7 +4,6 @@ urMemBufferPartitionTest.InvalidValueCreateType/* urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/* 
urMemGetInfoTestWithParam.Success/* urMemGetInfoTest.InvalidSizeSmall/* -urMemImageCreateTestWithImageFormatParam.Success/* urMemReleaseTest.Success/* urMemReleaseTest.CheckReferenceCount/* urMemRetainTest.Success/* diff --git a/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp index c305f58f00..b72b5a9425 100644 --- a/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp +++ b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp @@ -110,6 +110,10 @@ TEST_P(urMemImageCreateTestWithImageFormatParam, Success) { } } + if (res == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP() << "urMemImageCreate not supported"; + } + if (!is_primary_image_format && res == UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) { GTEST_SKIP(); diff --git a/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp b/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp index c33cc814a3..bf15fc16e3 100644 --- a/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp +++ b/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp @@ -10,7 +10,7 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemImageCreateWithNativeHandleTest); TEST_P(urMemImageCreateWithNativeHandleTest, Success) { ur_native_handle_t native_handle = 0; - ASSERT_SUCCESS(urMemGetNativeHandle(image, device, &native_handle)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urMemGetNativeHandle(image, device, &native_handle)); ur_mem_handle_t mem = nullptr; ASSERT_SUCCESS(urMemImageCreateWithNativeHandle( diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index e5c5d39591..a7072c9599 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -209,9 +209,9 @@ struct urMemImageTest : urContextTest { if (!imageSupported) { GTEST_SKIP(); } - ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, - &image_format, &image_desc, nullptr, - &image)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, + &image_desc, nullptr, &image)); } void TearDown() override { @@ -323,8 +323,9 @@ template struct urMemImageTestWithParam : urContextTestWithParam { if (!imageSupported) { GTEST_SKIP(); } - ASSERT_SUCCESS(urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, - &format, &desc, nullptr, &image)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &format, + &desc, nullptr, &image)); ASSERT_NE(nullptr, image); } From 546652dda7955aabd8dfb626ebbe46f170366d31 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 11 Dec 2024 21:45:56 +0000 Subject: [PATCH 143/148] [NativeCPU] implement missing query for HOST_PIPE support and skip pipe and fill2d tests if the feature is not supported --- source/adapters/native_cpu/device.cpp | 3 +++ .../device/device_adapter_native_cpu.match | 1 - .../enqueue/enqueue_adapter_level_zero.match | 1 - .../enqueue/enqueue_adapter_level_zero_v2.match | 1 - .../enqueue/enqueue_adapter_native_cpu.match | 10 ---------- test/conformance/enqueue/urEnqueueUSMFill2D.cpp | 11 ++++++++--- .../memory/urMemImageCreateWithNativeHandle.cpp | 3 ++- test/conformance/testing/include/uur/fixtures.h | 10 ++++++---- 8 files changed, 19 insertions(+), 21 deletions(-) diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 48cdf9a404..b00892d040 100644 --- a/source/adapters/native_cpu/device.cpp 
+++ b/source/adapters/native_cpu/device.cpp @@ -420,6 +420,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: + return ReturnValue(ur_bool_t{false}); + case UR_DEVICE_INFO_USM_POOL_SUPPORT: return ReturnValue(false); diff --git a/test/conformance/device/device_adapter_native_cpu.match b/test/conformance/device/device_adapter_native_cpu.match index da07da18dc..2764ad4fd1 100644 --- a/test/conformance/device/device_adapter_native_cpu.match +++ b/test/conformance/device/device_adapter_native_cpu.match @@ -11,7 +11,6 @@ urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_WORK_GROUPS_3D urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match index fdfbe01690..286cafc7fa 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match @@ -54,7 +54,6 @@ {{OPT}}urEnqueueMemImageWriteTest.InvalidRegion3D/* {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/* urEnqueueUSMAdviseTest.InvalidSizeTooLarge/* -urEnqueueUSMFill2DNegativeTest.OutOfBounds/* {{OPT}}urEnqueueUSMMemcpyTest.Blocking/* {{OPT}}urEnqueueUSMMemcpyTest.BlockingWithEvent/* {{OPT}}urEnqueueUSMMemcpyTest.WaitForDependencies/* diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match index f0af10a448..58a6ec4116 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match @@ -68,7 +68,6 @@ urEnqueueKernelLaunchKernelWgSizeTest.Success/* {{OPT}}urEnqueueMemImageWriteTest.InvalidRegion2D/* {{OPT}}urEnqueueMemImageWriteTest.InvalidRegion3D/* {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/* -urEnqueueUSMFill2DNegativeTest.OutOfBounds/* urEnqueueUSMAdviseTest.InvalidSizeTooLarge/* urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/* {{OPT}}urEnqueueTimestampRecordingExpTest.SuccessBlocking/* diff --git a/test/conformance/enqueue/enqueue_adapter_native_cpu.match b/test/conformance/enqueue/enqueue_adapter_native_cpu.match index 18abf6abfe..35b9df84de 100644 --- a/test/conformance/enqueue/enqueue_adapter_native_cpu.match +++ b/test/conformance/enqueue/enqueue_adapter_native_cpu.match @@ -139,16 +139,6 @@ urEnqueueEventsWaitWithBarrierOrderingTest.SuccessNonEventDependencies/*_ {{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeZero/* {{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/* {{OPT}}urEnqueueUSMPrefetchTest.InvalidEventWaitList/* -{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleQueue/* -{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleProgram/* -{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/* -{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/* -{{OPT}}urEnqueueReadHostPipeTest.InvalidEventWaitList/* 
-{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/* -{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/* -{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/* -{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/* -{{OPT}}urEnqueueWriteHostPipeTest.InvalidEventWaitList/* urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/* urEnqueueKernelLaunchIncrementMultiDeviceTest.Success/* urEnqueueKernelLaunchIncrementTest.Success/* diff --git a/test/conformance/enqueue/urEnqueueUSMFill2D.cpp b/test/conformance/enqueue/urEnqueueUSMFill2D.cpp index 29123b57bd..e5d3186b81 100644 --- a/test/conformance/enqueue/urEnqueueUSMFill2D.cpp +++ b/test/conformance/enqueue/urEnqueueUSMFill2D.cpp @@ -134,9 +134,9 @@ TEST_P(urEnqueueUSMFill2DTestWithParam, Success) { ur_event_handle_t event = nullptr; - ASSERT_SUCCESS(urEnqueueUSMFill2D(queue, ptr, pitch, pattern_size, - pattern.data(), width, height, 0, nullptr, - &event)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urEnqueueUSMFill2D(queue, ptr, pitch, pattern_size, pattern.data(), + width, height, 0, nullptr, &event)); EXPECT_SUCCESS(urQueueFlush(queue)); ASSERT_SUCCESS(urEventWait(1, &event)); @@ -161,6 +161,11 @@ struct urEnqueueUSMFill2DNegativeTest : uur::urQueueTest { ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, allocation_size, &ptr)); + + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urEnqueueUSMFill2D(queue, ptr, pitch, pattern_size, pattern.data(), + width, height, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); } void TearDown() override { diff --git a/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp b/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp index bf15fc16e3..a881af0752 100644 --- a/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp +++ b/test/conformance/memory/urMemImageCreateWithNativeHandle.cpp @@ -10,7 +10,8 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemImageCreateWithNativeHandleTest); TEST_P(urMemImageCreateWithNativeHandleTest, Success) { ur_native_handle_t native_handle = 0; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urMemGetNativeHandle(image, device, &native_handle)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urMemGetNativeHandle(image, device, &native_handle)); ur_mem_handle_t mem = nullptr; ASSERT_SUCCESS(urMemImageCreateWithNativeHandle( diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index a7072c9599..584ce8b9ca 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -374,10 +374,6 @@ struct urQueueTest : urContextTest { struct urHostPipeTest : urQueueTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); - UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource("foo", il_binary)); - ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( - platform, context, device, *il_binary, nullptr, &program)); size_t size = 0; ASSERT_SUCCESS(urDeviceGetInfo( @@ -385,6 +381,7 @@ struct urHostPipeTest : urQueueTest { &size)); ASSERT_NE(size, 0); ASSERT_EQ(sizeof(ur_bool_t), size); + void *info_data = alloca(size); ASSERT_SUCCESS(urDeviceGetInfo( device, UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED, size, @@ -396,6 +393,11 @@ struct urHostPipeTest : urQueueTest { if (!supported) { GTEST_SKIP() << "Host pipe read/write is not supported."; } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary)); + 
ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram(
+            platform, context, device, *il_binary, nullptr, &program));
     }
 
     void TearDown() override {

From 62380a96a1f7e24a0543e93cb4f2cf9949f1b789 Mon Sep 17 00:00:00 2001
From: "Zhao, Maosu"
Date: Wed, 18 Dec 2024 04:01:36 +0100
Subject: [PATCH 144/148] [DevMSAN] Don't return an error if getting
 "__MsanKernelMetadata" fails

This is the same fix as #2412; we will refine the logic later.
---
 source/loader/layers/sanitizer/msan/msan_interceptor.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
index 30a2e07359..b9fd9d1ed6 100644
--- a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
@@ -175,10 +175,7 @@ ur_result_t MsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
         Device, Program, kSPIR_MsanSpirKernelMetadata, &MetadataSize,
         &MetadataPtr);
     if (Result != UR_RESULT_SUCCESS) {
-        getContext()->logger.error(
-            "Can't get the pointer of <{}> under device {}: {}",
-            kSPIR_MsanSpirKernelMetadata, (void *)Device, Result);
-        return Result;
+        continue;
     }
 
     const uint64_t NumOfSpirKernel = MetadataSize / sizeof(SpirKernelInfo);

From 56a3c841f75e1d12310e9ed13c7a2fcf7398712b Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit"
Date: Tue, 17 Dec 2024 16:51:31 -0800
Subject: [PATCH 145/148] [L0] Fix Event Memory Leak due to no destroy on
 delete

- Given an internal event that is not cached, we must destroy the ZE
  event when the handle is destroyed. This can only occur if the
  associated queue does not have discard events enabled.
- During context release, and when event caching is disabled, the
  destroy path sets the event handle to nullptr to avoid double cleanup.

Signed-off-by: Neil R. Spruit
---
 source/adapters/level_zero/context.cpp |  1 +
 source/adapters/level_zero/event.cpp   | 21 +++++++++++++++++++++
 source/adapters/level_zero/event.hpp   |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index faa16d48dd..169c8ec097 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -422,6 +422,7 @@ ur_result_t ur_context_handle_t_::finalize() {
   for (auto &EventCache : EventCaches) {
     for (auto &Event : EventCache) {
       auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
+      Event->ZeEvent = nullptr;
       // Gracefully handle the case that L0 was already unloaded.
       if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
         return ze2urResult(ZeResult);
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 649319867d..3eb5d6fbcf 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -1051,6 +1051,26 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
   return UR_RESULT_SUCCESS;
 }
 
+/**
+ * @brief Destructor for the ur_event_handle_t_ class.
+ *
+ * This destructor is responsible for cleaning up the event handle when the
+ * object is destroyed. It checks if the event (`ZeEvent`) is valid and if the
+ * event has been completed (`Completed`). If both conditions are met, it
+ * further checks if the associated queue (`UrQueue`) is valid and if it is not
+ * set to discard events. If all conditions are satisfied, it calls
+ * `zeEventDestroy` to destroy the event.
+ * + * This ensures that resources are properly released and avoids potential memory + * leaks or resource mismanagement. + */ +ur_event_handle_t_::~ur_event_handle_t_() { + if (this->ZeEvent && this->Completed) { + if (this->UrQueue && !this->UrQueue->isDiscardEvents()) + ZE_CALL_NOCHECK(zeEventDestroy, (this->ZeEvent)); + } +} + ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (!Event->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -1073,6 +1093,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (Event->OwnNativeHandle) { if (DisableEventsCaching) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); + Event->ZeEvent = nullptr; // Gracefully handle the case that L0 was already unloaded. if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return ze2urResult(ZeResult); diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index de018e7060..efae32f361 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -156,6 +156,8 @@ struct ur_event_handle_t_ : _ur_object { reinterpret_cast(HostVisibleEvent)); } + ~ur_event_handle_t_(); + // Provide direct access to Context, instead of going via queue. // Not every PI event has a queue, and we need a handle to Context // to get to event pool related information. From 89d6afba6d55ae4c746b4743111f9eb70e0b44b4 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 19 Dec 2024 16:14:39 +0000 Subject: [PATCH 146/148] Remove stale link, update third-party tools docs Fixes #2487 --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 459b75398a..2b555702b9 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,14 @@ history to avoid pulling potentially breaking changes from the `main` branch. ## Third-Party tools -Tools can be acquired via instructions in [third_party](/third_party/README.md). +The recommended method to install the third-party tools is using a Python +virtual environment, for example: + +```bash +$ python -m venv .venv +$ source .venv/bin/activate +$ pip install -r third_party/requirements.txt +``` ## Building From 52574d8fcfb1df4679358e40482e14cf147b343c Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 25 Nov 2024 14:45:27 +0000 Subject: [PATCH 147/148] Enable platform CTS tests to run on all available platforms. 
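
Previously these suites ran only against the single platform selected at
startup. This patch adds the value-parameterized uur::urPlatformTest
fixture and the UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P macro, so each
suite below is instantiated once per discovered platform. A condensed
sketch of the migration pattern, drawn from the
urPlatformGetApiVersion.cpp hunk in this patch (no names beyond those
already in the diff are introduced):

    // Before: fixture bound to the one selected platform.
    // TEST_F(urPlatformGetApiVersionTest, Success) { ... }

    // After: the fixture receives each discovered platform via
    // GetParam(), exposed as the `platform` member.
    using urPlatformGetApiVersionTest = uur::urPlatformTest;
    UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformGetApiVersionTest);

    TEST_P(urPlatformGetApiVersionTest, Success) {
        ur_api_version_t version;
        ASSERT_EQ_RESULT(UR_RESULT_SUCCESS,
                         urPlatformGetApiVersion(platform, &version));
        ASSERT_GE(UR_API_VERSION_CURRENT, version);
    }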
--- .../cuda/urDeviceCreateWithNativeHandle.cpp | 2 +- test/conformance/device/urDeviceGet.cpp | 75 +++++----- .../device/urDeviceGetSelected.cpp | 41 +++--- .../urMemBufferMigrateAcrossDevices.cpp | 9 +- test/conformance/platform/CMakeLists.txt | 2 +- test/conformance/platform/fixtures.h | 88 ++++-------- .../urPlatformCreateWithNativeHandle.cpp | 133 +++++++++--------- test/conformance/platform/urPlatformGet.cpp | 7 +- .../platform/urPlatformGetApiVersion.cpp | 11 +- .../platform/urPlatformGetBackendOption.cpp | 30 ++-- .../platform/urPlatformGetInfo.cpp | 46 +++--- .../platform/urPlatformGetNativeHandle.cpp | 25 ++-- test/conformance/source/environment.cpp | 1 + .../testing/include/uur/environment.h | 3 + .../testing/include/uur/fixtures.h | 47 +++++-- test/conformance/testing/include/uur/utils.h | 9 ++ 16 files changed, 267 insertions(+), 262 deletions(-) diff --git a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index e4ac022507..615e6e95dd 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -5,7 +5,7 @@ #include "fixtures.h" -using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; +using urCudaDeviceCreateWithNativeHandle = uur::urSelectedPlatformTest; TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { // get a device from cuda diff --git a/test/conformance/device/urDeviceGet.cpp b/test/conformance/device/urDeviceGet.cpp index 5ce4c45906..f6ec9d82aa 100644 --- a/test/conformance/device/urDeviceGet.cpp +++ b/test/conformance/device/urDeviceGet.cpp @@ -6,8 +6,9 @@ #include using urDeviceGetTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urDeviceGetTest); -TEST_F(urDeviceGetTest, Success) { +TEST_P(urDeviceGetTest, Success) { uint32_t count = 0; ASSERT_SUCCESS( urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); @@ -20,7 +21,7 @@ TEST_F(urDeviceGetTest, Success) { } } -TEST_F(urDeviceGetTest, SuccessSubsetOfDevices) { +TEST_P(urDeviceGetTest, SuccessSubsetOfDevices) { uint32_t count; ASSERT_SUCCESS( urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); @@ -35,6 +36,41 @@ TEST_F(urDeviceGetTest, SuccessSubsetOfDevices) { } } +TEST_P(urDeviceGetTest, InvalidNullHandlePlatform) { + uint32_t count; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urDeviceGet(nullptr, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); +} + +TEST_P(urDeviceGetTest, InvalidEnumerationDevicesType) { + uint32_t count; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_ENUMERATION, + urDeviceGet(platform, UR_DEVICE_TYPE_FORCE_UINT32, 0, nullptr, &count)); +} + +TEST_P(urDeviceGetTest, InvalidSizeNumEntries) { + uint32_t count = 0; + ASSERT_SUCCESS( + urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); + ASSERT_NE(count, 0); + std::vector devices(count); + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_SIZE, + urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, devices.data(), nullptr)); +} + +TEST_P(urDeviceGetTest, InvalidNullPointerDevices) { + uint32_t count = 0; + ASSERT_SUCCESS( + urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); + ASSERT_NE(count, 0); + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urDeviceGet(platform, UR_DEVICE_TYPE_ALL, count, nullptr, nullptr)); +} + struct urDeviceGetTestWithDeviceTypeParam : uur::urAllDevicesTest, ::testing::WithParamInterface { @@ -70,38 +106,3 @@ TEST_P(urDeviceGetTestWithDeviceTypeParam, Success) { } } } - -TEST_F(urDeviceGetTest, 
InvalidNullHandlePlatform) { - uint32_t count; - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urDeviceGet(nullptr, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); -} - -TEST_F(urDeviceGetTest, InvalidEnumerationDevicesType) { - uint32_t count; - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_ENUMERATION, - urDeviceGet(platform, UR_DEVICE_TYPE_FORCE_UINT32, 0, nullptr, &count)); -} - -TEST_F(urDeviceGetTest, InvalidSizeNumEntries) { - uint32_t count = 0; - ASSERT_SUCCESS( - urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); - ASSERT_NE(count, 0); - std::vector devices(count); - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_SIZE, - urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, devices.data(), nullptr)); -} - -TEST_F(urDeviceGetTest, InvalidNullPointerDevices) { - uint32_t count = 0; - ASSERT_SUCCESS( - urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); - ASSERT_NE(count, 0); - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_NULL_POINTER, - urDeviceGet(platform, UR_DEVICE_TYPE_ALL, count, nullptr, nullptr)); -} diff --git a/test/conformance/device/urDeviceGetSelected.cpp b/test/conformance/device/urDeviceGetSelected.cpp index 953b418e24..2099d226c4 100644 --- a/test/conformance/device/urDeviceGetSelected.cpp +++ b/test/conformance/device/urDeviceGetSelected.cpp @@ -7,10 +7,11 @@ #include using urDeviceGetSelectedTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urDeviceGetSelectedTest); /* adpater agnostic tests -- none assume the existence or support of any specific adapter */ -TEST_F(urDeviceGetSelectedTest, Success) { +TEST_P(urDeviceGetSelectedTest, Success) { unsetenv("ONEAPI_DEVICE_SELECTOR"); uint32_t count = 0; ASSERT_SUCCESS( @@ -24,7 +25,7 @@ TEST_F(urDeviceGetSelectedTest, Success) { } } -TEST_F(urDeviceGetSelectedTest, SuccessSubsetOfDevices) { +TEST_P(urDeviceGetSelectedTest, SuccessSubsetOfDevices) { unsetenv("ONEAPI_DEVICE_SELECTOR"); uint32_t count = 0; ASSERT_SUCCESS( @@ -41,7 +42,7 @@ TEST_F(urDeviceGetSelectedTest, SuccessSubsetOfDevices) { } } -TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonStar) { +TEST_P(urDeviceGetSelectedTest, SuccessSelected_StarColonStar) { setenv("ONEAPI_DEVICE_SELECTOR", "*:*", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -71,7 +72,7 @@ TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonStar) { } } -TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonZero) { +TEST_P(urDeviceGetSelectedTest, SuccessSelected_StarColonZero) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -85,7 +86,7 @@ TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonZero) { } } -TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonZeroCommaStar) { +TEST_P(urDeviceGetSelectedTest, SuccessSelected_StarColonZeroCommaStar) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0,*", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -99,7 +100,7 @@ TEST_F(urDeviceGetSelectedTest, SuccessSelected_StarColonZeroCommaStar) { } } -TEST_F(urDeviceGetSelectedTest, SuccessSelected_DiscardStarColonStar) { +TEST_P(urDeviceGetSelectedTest, SuccessSelected_DiscardStarColonStar) { setenv("ONEAPI_DEVICE_SELECTOR", "!*:*", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -107,7 +108,7 @@ TEST_F(urDeviceGetSelectedTest, SuccessSelected_DiscardStarColonStar) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, SuccessSelected_SelectAndDiscard) { +TEST_P(urDeviceGetSelectedTest, SuccessSelected_SelectAndDiscard) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0;!*:*", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -115,7 +116,7 @@ 
TEST_F(urDeviceGetSelectedTest, SuccessSelected_SelectAndDiscard) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, +TEST_P(urDeviceGetSelectedTest, SuccessSelected_SelectSomethingAndDiscardSomethingElse) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0;!*:1", 1); uint32_t count = 0; @@ -130,7 +131,7 @@ TEST_F(urDeviceGetSelectedTest, } } -TEST_F(urDeviceGetSelectedTest, InvalidNullHandlePlatform) { +TEST_P(urDeviceGetSelectedTest, InvalidNullHandlePlatform) { unsetenv("ONEAPI_DEVICE_SELECTOR"); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -138,7 +139,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidNullHandlePlatform) { urDeviceGetSelected(nullptr, UR_DEVICE_TYPE_ALL, 0, nullptr, &count)); } -TEST_F(urDeviceGetSelectedTest, InvalidEnumerationDevicesType) { +TEST_P(urDeviceGetSelectedTest, InvalidEnumerationDevicesType) { unsetenv("ONEAPI_DEVICE_SELECTOR"); uint32_t count = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, @@ -146,7 +147,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidEnumerationDevicesType) { 0, nullptr, &count)); } -TEST_F(urDeviceGetSelectedTest, InvalidValueNumEntries) { +TEST_P(urDeviceGetSelectedTest, InvalidValueNumEntries) { unsetenv("ONEAPI_DEVICE_SELECTOR"); uint32_t count = 0; ASSERT_SUCCESS( @@ -158,7 +159,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidValueNumEntries) { devices.data(), nullptr)); } -TEST_F(urDeviceGetSelectedTest, InvalidMissingBackend) { +TEST_P(urDeviceGetSelectedTest, InvalidMissingBackend) { setenv("ONEAPI_DEVICE_SELECTOR", ":garbage", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -167,7 +168,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidMissingBackend) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidGarbageBackendString) { +TEST_P(urDeviceGetSelectedTest, InvalidGarbageBackendString) { setenv("ONEAPI_DEVICE_SELECTOR", "garbage:0", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -176,7 +177,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidGarbageBackendString) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidMissingFilterStrings) { +TEST_P(urDeviceGetSelectedTest, InvalidMissingFilterStrings) { setenv("ONEAPI_DEVICE_SELECTOR", "*", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -191,7 +192,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidMissingFilterStrings) { ASSERT_EQ(count2, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidMissingFilterString) { +TEST_P(urDeviceGetSelectedTest, InvalidMissingFilterString) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0,,2", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -200,7 +201,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidMissingFilterString) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidTooManyDotsInFilterString) { +TEST_P(urDeviceGetSelectedTest, InvalidTooManyDotsInFilterString) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0.1.2.3", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -209,7 +210,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidTooManyDotsInFilterString) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidBadWildardInFilterString) { +TEST_P(urDeviceGetSelectedTest, InvalidBadWildardInFilterString) { setenv("ONEAPI_DEVICE_SELECTOR", "*:*.", 1); uint32_t count = 0; ASSERT_EQ_RESULT( @@ -224,7 +225,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidBadWildardInFilterString) { ASSERT_EQ(count2, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidSelectingNonexistentDevice) { +TEST_P(urDeviceGetSelectedTest, InvalidSelectingNonexistentDevice) { setenv("ONEAPI_DEVICE_SELECTOR", "*:4321", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -232,7 +233,7 @@ TEST_F(urDeviceGetSelectedTest, 
InvalidSelectingNonexistentDevice) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidSelectingNonexistentSubDevice) { +TEST_P(urDeviceGetSelectedTest, InvalidSelectingNonexistentSubDevice) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0.4321", 1); uint32_t count = 0; ASSERT_SUCCESS( @@ -240,7 +241,7 @@ TEST_F(urDeviceGetSelectedTest, InvalidSelectingNonexistentSubDevice) { ASSERT_EQ(count, 0); } -TEST_F(urDeviceGetSelectedTest, InvalidSelectingNonexistentSubSubDevice) { +TEST_P(urDeviceGetSelectedTest, InvalidSelectingNonexistentSubSubDevice) { setenv("ONEAPI_DEVICE_SELECTOR", "*:0.0.4321", 1); uint32_t count = 0; ASSERT_SUCCESS( diff --git a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp index f7617a2940..040634898f 100644 --- a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp +++ b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp @@ -148,8 +148,9 @@ struct urMultiDeviceContextMemBufferTest : urMultiDeviceContextTest { std::vector kernels; std::vector metadatas{}; }; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urMultiDeviceContextMemBufferTest); -TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { +TEST_P(urMultiDeviceContextMemBufferTest, WriteRead) { if (num_devices == 1) { GTEST_SKIP(); } @@ -173,7 +174,7 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { } } -TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { +TEST_P(urMultiDeviceContextMemBufferTest, FillRead) { if (num_devices == 1) { GTEST_SKIP(); } @@ -197,7 +198,7 @@ TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { } } -TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { +TEST_P(urMultiDeviceContextMemBufferTest, WriteKernelRead) { if (num_devices == 1) { GTEST_SKIP(); } @@ -233,7 +234,7 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { } } -TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { +TEST_P(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { if (num_devices == 1) { GTEST_SKIP(); } diff --git a/test/conformance/platform/CMakeLists.txt b/test/conformance/platform/CMakeLists.txt index eec1e9bee3..e07689324b 100644 --- a/test/conformance/platform/CMakeLists.txt +++ b/test/conformance/platform/CMakeLists.txt @@ -3,7 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_conformance_test(platform +add_conformance_test_with_platform_environment(platform urPlatformCreateWithNativeHandle.cpp urPlatformGet.cpp urPlatformGetApiVersion.cpp diff --git a/test/conformance/platform/fixtures.h b/test/conformance/platform/fixtures.h index a11426b25c..60928d50c0 100644 --- a/test/conformance/platform/fixtures.h +++ b/test/conformance/platform/fixtures.h @@ -7,76 +7,42 @@ #define UR_CONFORMANCE_PLATFORM_FIXTURES_H_INCLUDED #include +#include + namespace uur { namespace platform { -struct urTest : ::testing::Test { - - void SetUp() override { - ur_device_init_flags_t device_flags = 0; - ASSERT_SUCCESS(urLoaderConfigCreate(&loader_config)); - ASSERT_SUCCESS(urLoaderConfigEnableLayer(loader_config, - "UR_LAYER_FULL_VALIDATION")); - ASSERT_SUCCESS(urLoaderInit(device_flags, loader_config)); - - uint32_t adapter_count; - ASSERT_SUCCESS(urAdapterGet(0, nullptr, &adapter_count)); - adapters.resize(adapter_count); - ASSERT_SUCCESS(urAdapterGet(adapter_count, adapters.data(), nullptr)); - } - - void TearDown() override { - for (auto adapter : adapters) { - ASSERT_SUCCESS(urAdapterRelease(adapter)); - } - if (loader_config) 
{ - ASSERT_SUCCESS(urLoaderConfigRelease(loader_config)); - } - ASSERT_SUCCESS(urLoaderTearDown()); - } - - ur_loader_config_handle_t loader_config = nullptr; - std::vector adapters; -}; - -struct urPlatformsTest : urTest { - - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urTest::SetUp()); - uint32_t count; - ASSERT_SUCCESS(urPlatformGet(adapters.data(), - static_cast(adapters.size()), 0, - nullptr, &count)); - ASSERT_NE(count, 0); - platforms.resize(count); - ASSERT_SUCCESS(urPlatformGet(adapters.data(), - static_cast(adapters.size()), - count, platforms.data(), nullptr)); - } - - std::vector platforms; +template +struct urPlatformTestWithParam + : ::testing::Test, + ::testing::WithParamInterface> { + void SetUp() override { platform = std::get<0>(this->GetParam()); } + const T &getParam() const { return std::get<1>(this->GetParam()); } + ur_platform_handle_t platform; }; -struct urPlatformTest : urPlatformsTest { +template +std::string platformTestWithParamPrinter( + const ::testing::TestParamInfo> &info) { + auto platform = std::get<0>(info.param); + auto param = std::get<1>(info.param); - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urPlatformsTest::SetUp()); - ASSERT_GE(platforms.size(), 1); - platform = platforms[0]; // TODO - which to choose? - } + std::stringstream ss; + ss << param; + return uur::GetPlatformNameWithID(platform) + "__" + + GTestSanitizeString(ss.str()); +} - ur_platform_handle_t platform; -}; +} // namespace platform +} // namespace uur -#define UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(FIXTURE) \ +#define UUR_PLATFORM_TEST_SUITE_P(FIXTURE, VALUES, TYPE) \ INSTANTIATE_TEST_SUITE_P( \ , FIXTURE, \ - ::testing::ValuesIn(uur::DevicesEnvironment::instance->devices), \ - [](const ::testing::TestParamInfo &info) { \ - return uur::GetPlatformAndDeviceName(info.param); \ - }) - -} // namespace platform -} // namespace uur + testing::Combine( \ + ::testing::ValuesIn( \ + uur::PlatformEnvironment::instance->all_platforms), \ + VALUES), \ + uur::platform::platformTestWithParamPrinter) #endif // UR_CONFORMANCE_PLATFORM_FIXTURES_H_INCLUDED diff --git a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp index 6b56f9b661..763c6efcac 100644 --- a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp +++ b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp @@ -3,87 +3,88 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -using urPlatformCreateWithNativeHandleTest = uur::platform::urPlatformTest; +struct urPlatformCreateWithNativeHandleTest : uur::urPlatformTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urPlatformTest::SetUp()); + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_ADAPTER, + sizeof(ur_adapter_handle_t), &adapter, + nullptr)); + } + ur_adapter_handle_t adapter = nullptr; +}; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformCreateWithNativeHandleTest); -TEST_F(urPlatformCreateWithNativeHandleTest, Success) { - for (auto platform : platforms) { - ur_native_handle_t native_handle = 0; +TEST_P(urPlatformCreateWithNativeHandleTest, Success) { + ur_native_handle_t native_handle = 0; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformGetNativeHandle(platform, &native_handle)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); - // We cannot assume anything about a native_handle, not even if it's - // `nullptr` since this could be a 
valid representation within a backend. - // We can however convert the native_handle back into a unified-runtime - // handle and perform some query on it to verify that it works. - ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( - native_handle, adapters[0], nullptr, &plat)); - ASSERT_NE(plat, nullptr); + // We cannot assume anything about a native_handle, not even if it's + // `nullptr` since this could be a valid representation within a backend. + // We can however convert the native_handle back into a unified-runtime + // handle and perform some query on it to verify that it works. + ur_platform_handle_t plat = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapter, nullptr, &plat)); + ASSERT_NE(plat, nullptr); - std::string input_platform_name = uur::GetPlatformName(platform); - std::string created_platform_name = uur::GetPlatformName(plat); - ASSERT_EQ(input_platform_name, created_platform_name); - } + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); } -TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { - for (auto platform : platforms) { - ur_native_handle_t native_handle = 0; +TEST_P(urPlatformCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { + ur_native_handle_t native_handle = 0; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformGetNativeHandle(platform, &native_handle)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); - // We cannot assume anything about a native_handle, not even if it's - // `nullptr` since this could be a valid representation within a backend. - // We can however convert the native_handle back into a unified-runtime - // handle and perform some query on it to verify that it works. - ur_platform_native_properties_t props = { - UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, true}; - ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( - native_handle, adapters[0], &props, &plat)); - ASSERT_NE(plat, nullptr); + // We cannot assume anything about a native_handle, not even if it's + // `nullptr` since this could be a valid representation within a backend. + // We can however convert the native_handle back into a unified-runtime + // handle and perform some query on it to verify that it works. 
+ ur_platform_native_properties_t props = { + UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, true}; + ur_platform_handle_t plat = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapter, &props, &plat)); + ASSERT_NE(plat, nullptr); - std::string input_platform_name = uur::GetPlatformName(platform); - std::string created_platform_name = uur::GetPlatformName(plat); - ASSERT_EQ(input_platform_name, created_platform_name); - } + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); } -TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { - for (auto platform : platforms) { - ur_native_handle_t native_handle = 0; +TEST_P(urPlatformCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { + ur_native_handle_t native_handle = 0; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformGetNativeHandle(platform, &native_handle)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); - // We cannot assume anything about a native_handle, not even if it's - // `nullptr` since this could be a valid representation within a backend. - // We can however convert the native_handle back into a unified-runtime - // handle and perform some query on it to verify that it works. - ur_platform_native_properties_t props = { - UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, false}; - ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( - native_handle, adapters[0], &props, &plat)); - ASSERT_NE(plat, nullptr); + // We cannot assume anything about a native_handle, not even if it's + // `nullptr` since this could be a valid representation within a backend. + // We can however convert the native_handle back into a unified-runtime + // handle and perform some query on it to verify that it works. 
+ ur_platform_native_properties_t props = { + UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, false}; + ur_platform_handle_t plat = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapter, &props, &plat)); + ASSERT_NE(plat, nullptr); - std::string input_platform_name = uur::GetPlatformName(platform); - std::string created_platform_name = uur::GetPlatformName(plat); - ASSERT_EQ(input_platform_name, created_platform_name); - } + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); } -TEST_F(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) { - for (auto platform : platforms) { - ur_native_handle_t native_handle = 0; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformGetNativeHandle(platform, &native_handle)); - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urPlatformCreateWithNativeHandle( - native_handle, adapters[0], nullptr, nullptr)); - } +TEST_P(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) { + ur_native_handle_t native_handle = 0; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urPlatformCreateWithNativeHandle(native_handle, adapter, + nullptr, nullptr)); } diff --git a/test/conformance/platform/urPlatformGet.cpp b/test/conformance/platform/urPlatformGet.cpp index 20f12c16df..b7305a066d 100644 --- a/test/conformance/platform/urPlatformGet.cpp +++ b/test/conformance/platform/urPlatformGet.cpp @@ -3,9 +3,12 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -using urPlatformGetTest = uur::platform::urTest; +struct urPlatformGetTest : ::testing::Test { + std::vector &adapters = + uur::PlatformEnvironment::instance->adapters; +}; TEST_F(urPlatformGetTest, Success) { uint32_t count; diff --git a/test/conformance/platform/urPlatformGetApiVersion.cpp b/test/conformance/platform/urPlatformGetApiVersion.cpp index 10279d560a..f3571f9a58 100644 --- a/test/conformance/platform/urPlatformGetApiVersion.cpp +++ b/test/conformance/platform/urPlatformGetApiVersion.cpp @@ -3,24 +3,25 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -using urPlatformGetApiVersionTest = uur::platform::urPlatformTest; +using urPlatformGetApiVersionTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformGetApiVersionTest); -TEST_F(urPlatformGetApiVersionTest, Success) { +TEST_P(urPlatformGetApiVersionTest, Success) { ur_api_version_t version; ASSERT_EQ_RESULT(UR_RESULT_SUCCESS, urPlatformGetApiVersion(platform, &version)); ASSERT_GE(UR_API_VERSION_CURRENT, version); } -TEST_F(urPlatformGetApiVersionTest, InvalidPlatform) { +TEST_P(urPlatformGetApiVersionTest, InvalidPlatform) { ur_api_version_t version; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urPlatformGetApiVersion(nullptr, &version)); } -TEST_F(urPlatformGetApiVersionTest, InvalidVersionPtr) { +TEST_P(urPlatformGetApiVersionTest, InvalidVersionPtr) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urPlatformGetApiVersion(platform, nullptr)); } diff --git a/test/conformance/platform/urPlatformGetBackendOption.cpp b/test/conformance/platform/urPlatformGetBackendOption.cpp index 06ac618580..30d59013df 100644 --- a/test/conformance/platform/urPlatformGetBackendOption.cpp +++ 
b/test/conformance/platform/urPlatformGetBackendOption.cpp @@ -4,45 +4,45 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "fixtures.h" -struct urPlatfromGetBackendOptionTestWithParam - : uur::platform::urPlatformTest, - ::testing::WithParamInterface {}; +using urPlatformGetBackendOptionTest = + uur::platform::urPlatformTestWithParam; -INSTANTIATE_TEST_SUITE_P(, urPlatfromGetBackendOptionTestWithParam, - ::testing::Values("-O0", "-O1", "-O2", "-O3"), - [](const ::testing::TestParamInfo &info) { - return uur::GTestSanitizeString(info.param); - }); +UUR_PLATFORM_TEST_SUITE_P(urPlatformGetBackendOptionTest, + ::testing::Values("-O0", "-O1", "-O2", "-O3"), + std::string); -TEST_P(urPlatfromGetBackendOptionTestWithParam, Success) { +TEST_P(urPlatformGetBackendOptionTest, Success) { const char *platformOption = nullptr; - ASSERT_SUCCESS(urPlatformGetBackendOption(platform, GetParam().c_str(), + ASSERT_SUCCESS(urPlatformGetBackendOption(platform, getParam().c_str(), &platformOption)); ASSERT_NE(platformOption, nullptr); } -using urPlatfromGetBackendOptionTest = uur::platform::urPlatformTest; +using urPlatformGetBackendOptionNegativeTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformGetBackendOptionNegativeTest); -TEST_F(urPlatfromGetBackendOptionTest, InvalidNullHandle) { +TEST_P(urPlatformGetBackendOptionNegativeTest, InvalidNullHandle) { const char *platformOption = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_HANDLE, urPlatformGetBackendOption(nullptr, "-O0", &platformOption)); } -TEST_F(urPlatfromGetBackendOptionTest, InvalidNullPointerFrontendOption) { +TEST_P(urPlatformGetBackendOptionNegativeTest, + InvalidNullPointerFrontendOption) { const char *platformOption = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_POINTER, urPlatformGetBackendOption(platform, nullptr, &platformOption)); } -TEST_F(urPlatfromGetBackendOptionTest, InvalidNullPointerPlatformOption) { +TEST_P(urPlatformGetBackendOptionNegativeTest, + InvalidNullPointerPlatformOption) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urPlatformGetBackendOption(platform, "-O0", nullptr)); } -TEST_F(urPlatfromGetBackendOptionTest, InvalidValueFrontendOption) { +TEST_P(urPlatformGetBackendOptionNegativeTest, InvalidValueFrontendOption) { const char *platformOption = nullptr; ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_VALUE, diff --git a/test/conformance/platform/urPlatformGetInfo.cpp b/test/conformance/platform/urPlatformGetInfo.cpp index 8535b42b5b..5f3e422740 100644 --- a/test/conformance/platform/urPlatformGetInfo.cpp +++ b/test/conformance/platform/urPlatformGetInfo.cpp @@ -4,32 +4,24 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "fixtures.h" +#include "uur/environment.h" +#include "uur/fixtures.h" #include -struct urPlatformGetInfoTest - : uur::platform::urPlatformTest, - ::testing::WithParamInterface { +using urPlatformGetInfoTest = + uur::platform::urPlatformTestWithParam; - void SetUp() { - UUR_RETURN_ON_FATAL_FAILURE(uur::platform::urPlatformTest::SetUp()); - } -}; - -INSTANTIATE_TEST_SUITE_P( - urPlatformGetInfo, urPlatformGetInfoTest, +UUR_PLATFORM_TEST_SUITE_P( + urPlatformGetInfoTest, ::testing::Values(UR_PLATFORM_INFO_NAME, UR_PLATFORM_INFO_VENDOR_NAME, UR_PLATFORM_INFO_VERSION, UR_PLATFORM_INFO_EXTENSIONS, UR_PLATFORM_INFO_PROFILE, UR_PLATFORM_INFO_BACKEND, UR_PLATFORM_INFO_ADAPTER), - [](const ::testing::TestParamInfo &info) { - std::stringstream ss; - ss << info.param; - return ss.str(); - }); + ur_platform_info_t); 
TEST_P(urPlatformGetInfoTest, Success) { size_t size = 0; - ur_platform_info_t info_type = GetParam(); + ur_platform_info_t info_type = getParam(); ASSERT_SUCCESS_OR_OPTIONAL_QUERY( urPlatformGetInfo(platform, info_type, 0, nullptr, &size), info_type); if (info_type == UR_PLATFORM_INFO_BACKEND) { @@ -57,8 +49,11 @@ TEST_P(urPlatformGetInfoTest, Success) { auto queried_adapter = *reinterpret_cast(name.data()); auto adapter_found = - std::find(adapters.begin(), adapters.end(), queried_adapter); - ASSERT_NE(adapter_found, adapters.end()); + std::find(uur::PlatformEnvironment::instance->adapters.begin(), + uur::PlatformEnvironment::instance->adapters.end(), + queried_adapter); + ASSERT_NE(adapter_found, + uur::AdapterEnvironment::instance->adapters.end()); break; } default: @@ -69,38 +64,41 @@ TEST_P(urPlatformGetInfoTest, Success) { TEST_P(urPlatformGetInfoTest, InvalidNullHandlePlatform) { size_t size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urPlatformGetInfo(nullptr, GetParam(), 0, nullptr, &size)); + urPlatformGetInfo(nullptr, getParam(), 0, nullptr, &size)); } -TEST_F(urPlatformGetInfoTest, InvalidEnumerationPlatformInfoType) { +using urPlatformGetInfoNegativeTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformGetInfoNegativeTest); + +TEST_P(urPlatformGetInfoNegativeTest, InvalidEnumerationPlatformInfoType) { size_t size = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urPlatformGetInfo(platform, UR_PLATFORM_INFO_FORCE_UINT32, 0, nullptr, &size)); } -TEST_F(urPlatformGetInfoTest, InvalidSizeZero) { +TEST_P(urPlatformGetInfoNegativeTest, InvalidSizeZero) { ur_platform_backend_t backend; ASSERT_EQ_RESULT(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, 0, &backend, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } -TEST_F(urPlatformGetInfoTest, InvalidSizeSmall) { +TEST_P(urPlatformGetInfoNegativeTest, InvalidSizeSmall) { ur_platform_backend_t backend; ASSERT_EQ_RESULT(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(backend) - 1, &backend, nullptr), UR_RESULT_ERROR_INVALID_SIZE); } -TEST_F(urPlatformGetInfoTest, InvalidNullPointerPropValue) { +TEST_P(urPlatformGetInfoNegativeTest, InvalidNullPointerPropValue) { ur_platform_backend_t backend; ASSERT_EQ_RESULT(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(backend), nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } -TEST_F(urPlatformGetInfoTest, InvalidNullPointerPropSizeRet) { +TEST_P(urPlatformGetInfoNegativeTest, InvalidNullPointerPropSizeRet) { ASSERT_EQ_RESULT(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/test/conformance/platform/urPlatformGetNativeHandle.cpp b/test/conformance/platform/urPlatformGetNativeHandle.cpp index 305eed5eda..ba4e29986f 100644 --- a/test/conformance/platform/urPlatformGetNativeHandle.cpp +++ b/test/conformance/platform/urPlatformGetNativeHandle.cpp @@ -3,28 +3,25 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include -using urPlatformGetNativeHandleTest = uur::platform::urPlatformsTest; +using urPlatformGetNativeHandleTest = uur::urPlatformTest; +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urPlatformGetNativeHandleTest); -TEST_F(urPlatformGetNativeHandleTest, Success) { - for (auto platform : platforms) { - ur_native_handle_t native_handle = 0; - if (auto error = urPlatformGetNativeHandle(platform, &native_handle)) { - ASSERT_EQ(UR_RESULT_ERROR_UNSUPPORTED_FEATURE, error); - } 
+TEST_P(urPlatformGetNativeHandleTest, Success) { + ur_native_handle_t native_handle = 0; + if (auto error = urPlatformGetNativeHandle(platform, &native_handle)) { + ASSERT_EQ(UR_RESULT_ERROR_UNSUPPORTED_FEATURE, error); } } -TEST_F(urPlatformGetNativeHandleTest, InvalidNullHandlePlatform) { +TEST_P(urPlatformGetNativeHandleTest, InvalidNullHandlePlatform) { ur_native_handle_t native_handle = 0; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urPlatformGetNativeHandle(nullptr, &native_handle)); } -TEST_F(urPlatformGetNativeHandleTest, InvalidNullPointerNativePlatform) { - for (auto platform : platforms) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urPlatformGetNativeHandle(platform, nullptr)); - } +TEST_P(urPlatformGetNativeHandleTest, InvalidNullPointerNativePlatform) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urPlatformGetNativeHandle(platform, nullptr)); } diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index e1d1951616..037ab4c882 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -145,6 +145,7 @@ void uur::PlatformEnvironment::selectPlatformFromOptions() { urPlatformGet(&a, 1, count, platform_list.data(), nullptr)); for (auto p : platform_list) { + all_platforms.push_back(p); ur_platform_backend_t backend; ASSERT_SUCCESS(urPlatformGetInfo(p, UR_PLATFORM_INFO_BACKEND, sizeof(ur_platform_backend_t), diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h index 397858333f..3d6c2cc0e4 100644 --- a/test/conformance/testing/include/uur/environment.h +++ b/test/conformance/testing/include/uur/environment.h @@ -42,6 +42,9 @@ struct PlatformEnvironment : AdapterEnvironment { PlatformOptions parsePlatformOptions(int argc, char **argv); PlatformOptions platform_options; + // List of all discovered platforms + std::vector all_platforms; + // Adapter and platform selected for testing via platform_options ur_adapter_handle_t adapter = nullptr; ur_platform_handle_t platform = nullptr; static PlatformEnvironment *instance; diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 5a409e465c..42896314cc 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -42,7 +42,10 @@ struct urAdapterTest : ::testing::Test, ur_adapter_handle_t adapter; }; -struct urPlatformTest : ::testing::Test { +// Inherit this to get the platform/adapter that was selected via the --backend +// and --platform arguments (or defaulted to if only one platform was +// discovered) +struct urSelectedPlatformTest : ::testing::Test { void SetUp() override { platform = uur::PlatformEnvironment::instance->platform; adapter = uur::PlatformEnvironment::instance->adapter; @@ -52,6 +55,17 @@ struct urPlatformTest : ::testing::Test { ur_adapter_handle_t adapter = nullptr; }; +// In the vein of urAdapterTest and urDeviceTest this is a parameterized +// platform fixture which can be instantiated via +// UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P to run tests on each discovered +// platform. 
+struct urPlatformTest : ::testing::Test, + ::testing::WithParamInterface { + void SetUp() override { platform = GetParam(); } + + ur_platform_handle_t platform = nullptr; +}; + inline std::pair> GetDevices(ur_platform_handle_t platform) { uint32_t count = 0; @@ -77,9 +91,9 @@ inline bool hasDevicePartitionSupport(ur_device_handle_t device, properties.end(); } -struct urAllDevicesTest : urPlatformTest { +struct urAllDevicesTest : urSelectedPlatformTest { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::SetUp()); auto devicesPair = GetDevices(platform); if (!devicesPair.first) { FAIL() << "Failed to get devices"; @@ -91,23 +105,23 @@ struct urAllDevicesTest : urPlatformTest { for (auto &device : devices) { EXPECT_SUCCESS(urDeviceRelease(device)); } - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::TearDown()); } std::vector devices; }; -struct urDeviceTest : urPlatformTest, +struct urDeviceTest : urSelectedPlatformTest, ::testing::WithParamInterface { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::SetUp()); device = GetParam(); EXPECT_SUCCESS(urDeviceRetain(device)); } void TearDown() override { EXPECT_SUCCESS(urDeviceRelease(device)); - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::SetUp()); } ur_device_handle_t device; @@ -122,6 +136,15 @@ struct urDeviceTest : urPlatformTest, return uur::GetAdapterBackendName(info.param); \ }) +#define UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(FIXTURE) \ + INSTANTIATE_TEST_SUITE_P( \ + , FIXTURE, \ + ::testing::ValuesIn( \ + uur::PlatformEnvironment::instance->all_platforms), \ + [](const ::testing::TestParamInfo &info) { \ + return uur::GetPlatformNameWithID(info.param); \ + }) + #define UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(FIXTURE) \ INSTANTIATE_TEST_SUITE_P( \ , FIXTURE, \ @@ -142,10 +165,10 @@ namespace uur { template struct urDeviceTestWithParam - : urPlatformTest, + : urSelectedPlatformTest, ::testing::WithParamInterface> { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::SetUp()); device = std::get<0>(this->GetParam()); } // TODO - I don't like the confusion with GetParam(); @@ -561,9 +584,9 @@ struct urMultiQueueTestWithParam : urContextTestWithParam { }; template -struct urMultiDeviceContextTestTemplate : urPlatformTest { +struct urMultiDeviceContextTestTemplate : urSelectedPlatformTest { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::SetUp()); auto &devices = DevicesEnvironment::instance->devices; if (devices.size() < MinDevices) { GTEST_SKIP(); @@ -576,7 +599,7 @@ struct urMultiDeviceContextTestTemplate : urPlatformTest { if (context) { ASSERT_SUCCESS(urContextRelease(context)); } - UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urSelectedPlatformTest::TearDown()); } ur_context_handle_t context = nullptr; diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index 2415e580c5..76b0cfe31d 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -190,6 +190,15 @@ inline std::string GetPlatformName(ur_platform_handle_t hPlatform) { 
std::string(platform_name.data(), platform_name.size())); } +inline std::string GetPlatformNameWithID(ur_platform_handle_t hPlatform) { + auto platform_name = GetPlatformName(hPlatform); + auto &platforms = uur::PlatformEnvironment::instance->all_platforms; + size_t platform_id = + std::find(platforms.begin(), platforms.end(), hPlatform) - + platforms.begin(); + return platform_name + "_ID" + std::to_string(platform_id); +} + inline std::string GetDeviceName(ur_device_handle_t device) { std::string device_name, device_uuid; GetDeviceInfo(device, UR_DEVICE_INFO_NAME, device_name); From 59b37e3fd00a4f9318fb20a9ca817f254b4b089e Mon Sep 17 00:00:00 2001 From: "Mestre, Fabio" Date: Fri, 20 Dec 2024 17:01:38 +0100 Subject: [PATCH 148/148] Update usage of zeCommandListImmediateAppendCommandListsExp to use dlsym The implementation was using zeCommandListImmediateAppendCommandListsExp directly with the loader. This creates an issue with old loaders that don't support this entrypoint. Instead, this change uses dlsym to load the function if available. --- source/adapters/level_zero/command_buffer.cpp | 25 +++++++++---------- source/adapters/level_zero/platform.cpp | 24 ++++++++++++++++++ source/adapters/level_zero/platform.hpp | 9 ++++++- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index b96aeeede9..c4d9614159 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -26,14 +26,9 @@ namespace { // given Context and Device. bool checkImmediateAppendSupport(ur_context_handle_t Context, ur_device_handle_t Device) { - // TODO The L0 driver is not reporting this extension yet. Once it does, - // switch to using the variable zeDriverImmediateCommandListAppendFound. - // Minimum version that supports zeCommandListImmediateAppendCommandListsExp. - constexpr uint32_t MinDriverVersion = 30898; bool DriverSupportsImmediateAppend = - Context->getPlatform()->isDriverVersionNewerOrSimilar(1, 3, - MinDriverVersion); + Context->getPlatform()->ZeCommandListImmediateAppendExt.Supported; // If this environment variable is: // - Set to 1: the immediate append path will always be enabled as long the @@ -58,10 +53,8 @@ bool checkImmediateAppendSupport(ur_context_handle_t Context, if (EnableAppendPath && !DriverSupportsImmediateAppend) { logger::error("{} is set but " "the current driver does not support the " - "zeCommandListImmediateAppendCommandListsExp entrypoint. 
A " - "driver version of at least {} is required to use the " - "immediate append path.", - AppendEnvVarName, MinDriverVersion); + "zeCommandListImmediateAppendCommandListsExp entrypoint.", + AppendEnvVarName); std::abort(); } @@ -1569,7 +1562,10 @@ ur_result_t enqueueImmediateAppendPath( ur_event_handle_t *Event, ur_command_list_ptr_t CommandListHelper, bool DoProfiling) { + ur_platform_handle_t Platform = CommandBuffer->Context->getPlatform(); + assert(CommandListHelper->second.IsImmediate); + assert(Platform->ZeCommandListImmediateAppendExt.Supported); _ur_ze_event_list_t UrZeEventList; if (NumEventsInWaitList) { @@ -1587,7 +1583,8 @@ ur_result_t enqueueImmediateAppendPath( nullptr /*ForcedCmdQueue*/)); assert(ZeCopyEngineImmediateListHelper->second.IsImmediate); - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (ZeCopyEngineImmediateListHelper->first, 1, &CommandBuffer->ZeCopyCommandList, nullptr, UrZeEventList.Length, UrZeEventList.ZeEventList)); @@ -1599,7 +1596,8 @@ ur_result_t enqueueImmediateAppendPath( ze_event_handle_t &EventToSignal = DoProfiling ? CommandBuffer->ComputeFinishedEvent->ZeEvent : (*Event)->ZeEvent; - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (CommandListHelper->first, 1, &CommandBuffer->ZeComputeCommandList, EventToSignal, WaitList.Length, WaitList.ZeEventList)); @@ -1616,7 +1614,8 @@ ur_result_t enqueueImmediateAppendPath( (CommandListHelper->first, CommandBuffer->ExecutionFinishedEvent->ZeEvent, 0, nullptr)); - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (CommandListHelper->first, 1, &CommandBuffer->ZeCommandListResetEvents, nullptr, 0, nullptr)); } diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 6ae1deaabc..26b5a03ed6 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -224,6 +224,7 @@ ur_result_t ur_platform_handle_t_::initialize() { bool MutableCommandListSpecExtensionSupported = false; bool ZeIntelExternalSemaphoreExtensionSupported = false; + bool ZeImmediateCommandListAppendExtensionFound = false; for (auto &extension : ZeExtensions) { // Check if global offset extension is available if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, @@ -248,6 +249,14 @@ ur_result_t ur_platform_handle_t_::initialize() { ZeDriverEventPoolCountingEventsExtensionFound = true; } } + // Check if the ImmediateAppendCommandLists extension is available. + if (strncmp(extension.name, ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_NAME, + strlen(ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_NAME) + 1) == 0) { + if (extension.version == + ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_VERSION_CURRENT) { + ZeImmediateCommandListAppendExtensionFound = true; + } + } // Check if extension is available for Mutable Command List v1.1. if (strncmp(extension.name, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, strlen(ZE_MUTABLE_COMMAND_LIST_EXP_NAME) + 1) == 0) { @@ -427,6 +436,21 @@ ur_result_t ur_platform_handle_t_::initialize() { &ZeMutableCmdListExt .zexCommandListGetNextCommandIdWithKernelsExp))) == 0); } + + // Check if ImmediateAppendCommandList is supported and initialize the + // function pointer. 
+ if (ZeImmediateCommandListAppendExtensionFound) { + ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp = + (ze_pfnCommandListImmediateAppendCommandListsExp_t) + ur_loader::LibLoader::getFunctionPtr( + GlobalAdapter->processHandle, + "zeCommandListImmediateAppendCommandListsExp"); + ZeCommandListImmediateAppendExt.Supported = + ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp != nullptr; + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index 748460158c..0faa122651 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -134,4 +134,11 @@ struct ur_platform_handle_t_ : public _ur_platform { ze_result_t (*zexDeviceReleaseExternalSemaphoreExp)( ze_intel_external_semaphore_exp_handle_t); } ZeExternalSemaphoreExt; -}; \ No newline at end of file + + struct ZeCommandListImmediateAppendExtension { + bool Supported = false; + ze_result_t (*zeCommandListImmediateAppendCommandListsExp)( + ze_command_list_handle_t, uint32_t, ze_command_list_handle_t *, + ze_event_handle_t, uint32_t, ze_event_handle_t *); + } ZeCommandListImmediateAppendExt; +};
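
Note: because the entrypoint is now resolved at runtime, call sites gate
on the Supported flag rather than on a minimum driver version. A
condensed sketch of the dispatch pattern from the command_buffer.cpp
hunk above (surrounding declarations and error handling elided):

    // ZeCommandListImmediateAppendExt is populated once during
    // ur_platform_handle_t_::initialize() via LibLoader::getFunctionPtr();
    // on old loaders the pointer is null and Supported stays false.
    ur_platform_handle_t Platform = CommandBuffer->Context->getPlatform();
    assert(Platform->ZeCommandListImmediateAppendExt.Supported);

    ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt
                   .zeCommandListImmediateAppendCommandListsExp,
               (CommandListHelper->first, 1,
                &CommandBuffer->ZeComputeCommandList, EventToSignal,
                WaitList.Length, WaitList.ZeEventList));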