From f2070c05c0101c502aadf8359ae4361225995cfb Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Fri, 17 Jun 2022 15:59:34 -0700 Subject: [PATCH 01/13] Initial Level 0 support. Profiling works, but tracing timestamps are quite bogus. --- CMakeLists.txt | 11 + cmake/Modules/APEX_DefaultOptions.cmake | 1 + cmake/Modules/FindLEVEL0.cmake | 35 + src/apex/CMakeLists.standalone | 5 + src/apex/L0/demangle.h | 42 + src/apex/L0/pti_assert.h | 17 + src/apex/L0/utils.h | 220 ++ src/apex/L0/ze_api_callbacks.h | 4467 +++++++++++++++++++++++ src/apex/L0/ze_api_collector.h | 203 + src/apex/L0/ze_kernel_collector.h | 726 ++++ src/apex/L0/ze_utils.h | 392 ++ src/apex/apex_level0.cpp | 386 ++ src/apex/async_thread_node.hpp | 29 + src/apex/utils.hpp | 5 +- src/scripts/apex_exec | 6 + 15 files changed, 6543 insertions(+), 2 deletions(-) create mode 100644 cmake/Modules/FindLEVEL0.cmake create mode 100644 src/apex/L0/demangle.h create mode 100644 src/apex/L0/pti_assert.h create mode 100644 src/apex/L0/utils.h create mode 100644 src/apex/L0/ze_api_callbacks.h create mode 100644 src/apex/L0/ze_api_collector.h create mode 100644 src/apex/L0/ze_kernel_collector.h create mode 100644 src/apex/L0/ze_utils.h create mode 100644 src/apex/apex_level0.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e0df0ae..c3f351d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,6 +167,17 @@ if(APEX_WITH_HIP) endif() endif(APEX_WITH_HIP) +if(APEX_WITH_LEVEL0) + find_package(LEVEL0 REQUIRED) + if (LEVEL0_FOUND) + include_directories(${LEVEL0_INCLUDE_DIRS}) + set(LIBS ${LIBS} ${LEVEL0_LIBRARIES}) + if (NOT BUILD_STATIC_EXECUTABLES) + set (CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_RPATH} ${LEVEL0_LIBRARY_DIR}) + endif() + endif() +endif(APEX_WITH_LEVEL0) + # set(APEX_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "Configuration type (one of Debug, RelWithDebInfo, Release, MinSizeRel)" FORCE) # Always force CMAKE_CONFIGURATION_TYPES to be the same as CMAKE_BUILD_TYPE diff --git 
a/cmake/Modules/APEX_DefaultOptions.cmake b/cmake/Modules/APEX_DefaultOptions.cmake index e4daf2ef..6a8bc8ea 100644 --- a/cmake/Modules/APEX_DefaultOptions.cmake +++ b/cmake/Modules/APEX_DefaultOptions.cmake @@ -7,6 +7,7 @@ option (APEX_WITH_ACTIVEHARMONY "Enable ActiveHarmony support" FALSE) option (APEX_WITH_BFD "Enable Binutils (BFD)support" FALSE) option (APEX_WITH_CUDA "Enable CUDA (CUPTI) support" FALSE) option (APEX_WITH_HIP "Enable HIP (ROCTRACER) support" FALSE) +option (APEX_WITH_LEVEL0 "Enable LEVEL0 (Intel OneAPI) support" FALSE) option (APEX_WITH_MPI "Enable MPI support" FALSE) option (APEX_WITH_OMPT "Enable OpenMP Tools (OMPT) support" FALSE) option (APEX_WITH_OTF2 "Enable Open Trace Format 2 (OTF2) support" FALSE) diff --git a/cmake/Modules/FindLEVEL0.cmake b/cmake/Modules/FindLEVEL0.cmake new file mode 100644 index 00000000..063496ea --- /dev/null +++ b/cmake/Modules/FindLEVEL0.cmake @@ -0,0 +1,35 @@ +# - Try to find LibLEVEL0 +# Once done this will define +# LEVEL0_FOUND - System has LEVEL0 +# LEVEL0_INCLUDE_DIRS - The LEVEL0 include directories +# LEVEL0_LIBRARIES - The libraries needed to use LEVEL0 +# LEVEL0_DEFINITIONS - Compiler switches required for using LEVEL0 + +if(NOT DEFINED LEVEL0_ROOT) + if(DEFINED ENV{LEVEL0_ROOT}) + # message(" env LEVEL0_ROOT is defined as $ENV{LEVEL0_ROOT}") + set(LEVEL0_ROOT $ENV{LEVEL0_ROOT}) + endif() +endif() + +find_path(LEVEL0_INCLUDE_DIR NAMES level_zero/ze_api.h + HINTS ${LEVEL0_ROOT}/include /usr ${LEVEL0_ROOT}) + +find_library(LEVEL0_LIBRARY NAMES ze_loader + HINTS ${LEVEL0_ROOT} ${LEVEL0_ROOT}/lib64 ${LEVEL0_ROOT}/lib /usr/lib64 /usr/lib) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set LEVEL0_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(LEVEL0 DEFAULT_MSG + LEVEL0_LIBRARY LEVEL0_INCLUDE_DIR) + +mark_as_advanced(LEVEL0_INCLUDE_DIR LEVEL0_LIBRARY) + +if(LEVEL0_FOUND) + set(LEVEL0_LIBRARIES ${LEVEL0_LIBRARY} ) + 
set(LEVEL0_INCLUDE_DIRS ${LEVEL0_INCLUDE_DIR}) + set(LEVEL0_DIR ${LEVEL0_ROOT}) + add_definitions(-DAPEX_HAVE_LEVEL0) +endif() + diff --git a/src/apex/CMakeLists.standalone b/src/apex/CMakeLists.standalone index 3312ef0e..b1d5bcfa 100644 --- a/src/apex/CMakeLists.standalone +++ b/src/apex/CMakeLists.standalone @@ -37,6 +37,10 @@ if (CUPTI_FOUND) SET(CUPTI_SOURCE cupti_trace.cpp) endif(CUPTI_FOUND) +if (LEVEL0_FOUND) +SET(LEVEL0_SOURCE apex_level0.cpp) +endif(LEVEL0_FOUND) + if (ROCTRACER_FOUND) SET(ROCTRACER_SOURCE hip_trace.cpp) endif(ROCTRACER_FOUND) @@ -68,6 +72,7 @@ endif (APEX_WITH_RAJA AND RAJA_FOUND) # Try to keep this in alphabetical order SET(all_SOURCE ${CUPTI_SOURCE} +${LEVEL0_SOURCE} ${ROCPROFILER_SOURCE} ${ROCTRACER_SOURCE} ${NVML_SOURCE} diff --git a/src/apex/L0/demangle.h b/src/apex/L0/demangle.h new file mode 100644 index 00000000..afff205b --- /dev/null +++ b/src/apex/L0/demangle.h @@ -0,0 +1,42 @@ +#pragma once + +#if __has_include() +#define HAVE_CXXABI 1 +#include +#include +#else +#define HAVE_CXXABI 0 +#endif +#include + +#include "pti_assert.h" + +namespace utils { + + static inline std::string Demangle(const char* name) { + PTI_ASSERT(name != nullptr); + +#if HAVE_CXXABI + int status = 0; + char* demangled = abi::__cxa_demangle(name, nullptr, 0, &status); + if (status != 0) { + return name; + } + + constexpr const char* const prefix_to_skip = "typeinfo name for "; + const size_t prefix_to_skip_len = strlen(prefix_to_skip); + const size_t shift = + (std::strncmp(demangled, prefix_to_skip, prefix_to_skip_len) == 0) ? 
+ prefix_to_skip_len : 0; + + std::string result(demangled + shift); + free(demangled); + return result; +#else + return name; +#endif + } + +} // namespace utils + +#undef HAVE_CXXABI diff --git a/src/apex/L0/pti_assert.h b/src/apex/L0/pti_assert.h new file mode 100644 index 00000000..f0bfef04 --- /dev/null +++ b/src/apex/L0/pti_assert.h @@ -0,0 +1,17 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif + +#define PTI_ASSERT(X) assert(X) diff --git a/src/apex/L0/utils.h b/src/apex/L0/utils.h new file mode 100644 index 00000000..9def9b00 --- /dev/null +++ b/src/apex/L0/utils.h @@ -0,0 +1,220 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#if defined(_WIN32) +#include +#else +#include +#include +#endif + +#include + +#include +#include +#include + +#include "pti_assert.h" + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define MAX_STR_SIZE 1024 + +#define BYTES_IN_MBYTES (1024 * 1024) + +#define NSEC_IN_USEC 1000 +#define MSEC_IN_SEC 1000 +#define NSEC_IN_MSEC 1000000 +#define NSEC_IN_SEC 1000000000 + +namespace utils { + + struct Comparator { + template + bool operator()(const T& left, const T& right) const { + if (left.second != right.second) { + return left.second > right.second; + } + return left.first > right.first; + } + }; + +#if defined(__gnu_linux__) + + inline uint64_t GetTime(clockid_t id) { + timespec ts{0}; + int status = clock_gettime(id, &ts); + PTI_ASSERT(status == 0); + return ts.tv_sec * NSEC_IN_SEC + ts.tv_nsec; + } + + inline uint64_t ConvertClockMonotonicToRaw(uint64_t clock_monotonic) { + uint64_t 
raw = GetTime(CLOCK_MONOTONIC_RAW); + uint64_t monotonic = GetTime(CLOCK_MONOTONIC); + return (raw > monotonic) ? + clock_monotonic + (raw - monotonic) : + clock_monotonic - (monotonic - raw); + } + +#endif + + inline std::string GetFilePath(const std::string& filename) { + PTI_ASSERT(!filename.empty()); + + size_t pos = filename.find_last_of("/\\"); + if (pos == std::string::npos) { + return ""; + } + + return filename.substr(0, pos + 1); + } + + inline std::string GetExecutablePath() { + char buffer[MAX_STR_SIZE] = { 0 }; +#if defined(_WIN32) + DWORD status = GetModuleFileNameA(nullptr, buffer, MAX_STR_SIZE); + PTI_ASSERT(status > 0); +#else + ssize_t status = readlink("/proc/self/exe", buffer, MAX_STR_SIZE - 1); + PTI_ASSERT(status > 0); +#endif + return GetFilePath(buffer); + } + + inline std::string GetExecutableName() { + char buffer[MAX_STR_SIZE] = { 0 }; +#if defined(_WIN32) + DWORD status = GetModuleFileNameA(nullptr, buffer, MAX_STR_SIZE); + PTI_ASSERT(status > 0); +#else + ssize_t status = readlink("/proc/self/exe", buffer, MAX_STR_SIZE - 1); + PTI_ASSERT(status > 0); +#endif + std::string path(buffer); + return path.substr(path.find_last_of("/\\") + 1); + } + + inline std::vector LoadBinaryFile(const std::string& path) { + std::vector binary; + std::ifstream stream(path, std::ios::in | std::ios::binary); + if (!stream.good()) { + return binary; + } + + stream.seekg(0, std::ifstream::end); + size_t size = stream.tellg(); + stream.seekg(0, std::ifstream::beg); + if (size == 0) { + return binary; + } + + binary.resize(size); + stream.read(reinterpret_cast(binary.data()), size); + return binary; + } + + inline void SetEnv(const char* name, const char* value) { + PTI_ASSERT(name != nullptr); + PTI_ASSERT(value != nullptr); + + int status = 0; +#if defined(_WIN32) + std::string str = std::string(name) + "=" + value; + status = _putenv(str.c_str()); +#else + status = setenv(name, value, 1); +#endif + PTI_ASSERT(status == 0); + } + + inline std::string GetEnv(const 
char* name) { + PTI_ASSERT(name != nullptr); +#if defined(_WIN32) + char* value = nullptr; + errno_t status = _dupenv_s(&value, nullptr, name); + PTI_ASSERT(status == 0); + if (value == nullptr) { + return std::string(); + } + std::string result(value); + free(value); + return result; +#else + const char* value = getenv(name); + if (value == nullptr) { + return std::string(); + } + return std::string(value); +#endif + } + + inline uint32_t GetPid() { +#if defined(_WIN32) + return GetCurrentProcessId(); +#else + return getpid(); +#endif + } + + inline uint32_t GetTid() { +#if defined(_WIN32) + return GetCurrentThreadId(); +#else +#ifdef SYS_gettid + return syscall(SYS_gettid); +#else +#error "SYS_gettid is unavailable on this system" +#endif +#endif + } + + inline uint64_t GetSystemTime() { +#if defined(_WIN32) + LARGE_INTEGER ticks{0}; + LARGE_INTEGER frequency{0}; + BOOL status = QueryPerformanceFrequency(&frequency); + PTI_ASSERT(status != 0); + status = QueryPerformanceCounter(&ticks); + PTI_ASSERT(status != 0); + return ticks.QuadPart * (NSEC_IN_SEC / frequency.QuadPart); +#else + return GetTime(CLOCK_MONOTONIC_RAW); +#endif + } + + inline size_t LowerBound(const std::vector& data, uint64_t value) { + size_t start = 0; + size_t end = data.size(); + while (start < end) { + size_t middle = (start + end) / 2; + if (value <= data[middle]) { + end = middle; + } else { + start = middle + 1; + } + } + return start; + } + + inline size_t UpperBound(const std::vector& data, uint64_t value) { + size_t start = 0; + size_t end = data.size(); + while (start < end) { + size_t middle = (start + end) / 2; + if (value >= data[middle]) { + start = middle + 1; + } else { + end = middle; + } + } + return start; + } + +} // namespace utils + diff --git a/src/apex/L0/ze_api_callbacks.h b/src/apex/L0/ze_api_callbacks.h new file mode 100644 index 00000000..934c2ee7 --- /dev/null +++ b/src/apex/L0/ze_api_callbacks.h @@ -0,0 +1,4467 @@ 
+//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#include "pti_assert.h" + +static void zeInitOnEnter( + ze_init_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeInitOnExit( + ze_init_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeInit", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeInit", + start_time, end_time); + } + +} + +static void zeDriverGetOnEnter( + ze_driver_get_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDriverGetOnExit( + ze_driver_get_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = 
*reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDriverGet", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDriverGet", + start_time, end_time); + } +} + +static void zeDriverGetApiVersionOnEnter( + ze_driver_get_api_version_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDriverGetApiVersionOnExit( + ze_driver_get_api_version_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDriverGetApiVersion", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDriverGetApiVersion", + start_time, end_time); + } +} + +static void zeDriverGetPropertiesOnEnter( + ze_driver_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDriverGetPropertiesOnExit( + ze_driver_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDriverGetProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDriverGetProperties", + start_time, end_time); + } +} + +static void zeDriverGetIpcPropertiesOnEnter( + ze_driver_get_ipc_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDriverGetIpcPropertiesOnExit( + ze_driver_get_ipc_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDriverGetIpcProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDriverGetIpcProperties", + start_time, end_time); + } +} + +static void zeDriverGetExtensionPropertiesOnEnter( + ze_driver_get_extension_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + 
uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDriverGetExtensionPropertiesOnExit( + ze_driver_get_extension_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDriverGetExtensionProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDriverGetExtensionProperties", + start_time, end_time); + } +} + +static void zeDeviceGetOnEnter( + ze_device_get_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetOnExit( + ze_device_get_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGet", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGet", + start_time, end_time); + } +} + +static void zeDeviceGetSubDevicesOnEnter( + ze_device_get_sub_devices_params_t* params, + 
ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetSubDevicesOnExit( + ze_device_get_sub_devices_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetSubDevices", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetSubDevices", + start_time, end_time); + } +} + +static void zeDeviceGetPropertiesOnEnter( + ze_device_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetPropertiesOnExit( + ze_device_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetProperties", time); + + if (collector->callback_ != 
nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetProperties", + start_time, end_time); + } +} + +static void zeDeviceGetComputePropertiesOnEnter( + ze_device_get_compute_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetComputePropertiesOnExit( + ze_device_get_compute_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetComputeProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetComputeProperties", + start_time, end_time); + } +} + +static void zeDeviceGetModulePropertiesOnEnter( + ze_device_get_module_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetModulePropertiesOnExit( + ze_device_get_module_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = 
collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetModuleProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetModuleProperties", + start_time, end_time); + } +} + +static void zeDeviceGetCommandQueueGroupPropertiesOnEnter( + ze_device_get_command_queue_group_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetCommandQueueGroupPropertiesOnExit( + ze_device_get_command_queue_group_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetCommandQueueGroupProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetCommandQueueGroupProperties", + start_time, end_time); + } +} + +static void zeDeviceGetMemoryPropertiesOnEnter( + ze_device_get_memory_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + 
start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetMemoryPropertiesOnExit( + ze_device_get_memory_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetMemoryProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetMemoryProperties", + start_time, end_time); + } +} + +static void zeDeviceGetMemoryAccessPropertiesOnEnter( + ze_device_get_memory_access_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetMemoryAccessPropertiesOnExit( + ze_device_get_memory_access_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetMemoryAccessProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetMemoryAccessProperties", + start_time, end_time); + } +} + +static void 
zeDeviceGetCachePropertiesOnEnter( + ze_device_get_cache_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetCachePropertiesOnExit( + ze_device_get_cache_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetCacheProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetCacheProperties", + start_time, end_time); + } +} + +static void zeDeviceGetImagePropertiesOnEnter( + ze_device_get_image_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetImagePropertiesOnExit( + ze_device_get_image_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + 
uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetImageProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetImageProperties", + start_time, end_time); + } +} + +static void zeDeviceGetExternalMemoryPropertiesOnEnter( + ze_device_get_external_memory_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetExternalMemoryPropertiesOnExit( + ze_device_get_external_memory_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetExternalMemoryProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetExternalMemoryProperties", + start_time, end_time); + } +} + +static void zeDeviceGetP2PPropertiesOnEnter( + ze_device_get_p2_p_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetP2PPropertiesOnExit( + ze_device_get_p2_p_properties_params_t* params, + ze_result_t result, + void* global_user_data, + 
void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetP2PProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetP2PProperties", + start_time, end_time); + } +} + +static void zeDeviceCanAccessPeerOnEnter( + ze_device_can_access_peer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceCanAccessPeerOnExit( + ze_device_can_access_peer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceCanAccessPeer", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceCanAccessPeer", + start_time, end_time); + } +} + +static void zeDeviceGetStatusOnEnter( + ze_device_get_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = 
*reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeDeviceGetStatusOnExit( + ze_device_get_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeDeviceGetStatus", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeDeviceGetStatus", + start_time, end_time); + } +} + +static void zeContextCreateOnEnter( + ze_context_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextCreateOnExit( + ze_context_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextCreate", + start_time, end_time); + } +} + +static void zeContextDestroyOnEnter( + ze_context_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextDestroyOnExit( + ze_context_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextDestroy", + start_time, end_time); + } +} + +static void zeContextGetStatusOnEnter( + ze_context_get_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextGetStatusOnExit( + ze_context_get_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextGetStatus", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextGetStatus", + 
start_time, end_time); + } +} + +static void zeContextSystemBarrierOnEnter( + ze_context_system_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextSystemBarrierOnExit( + ze_context_system_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextSystemBarrier", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextSystemBarrier", + start_time, end_time); + } +} + +static void zeContextMakeMemoryResidentOnEnter( + ze_context_make_memory_resident_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextMakeMemoryResidentOnExit( + ze_context_make_memory_resident_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + 
PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextMakeMemoryResident", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextMakeMemoryResident", + start_time, end_time); + } +} + +static void zeContextEvictMemoryOnEnter( + ze_context_evict_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextEvictMemoryOnExit( + ze_context_evict_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextEvictMemory", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextEvictMemory", + start_time, end_time); + } +} + +static void zeContextMakeImageResidentOnEnter( + ze_context_make_image_resident_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextMakeImageResidentOnExit( + ze_context_make_image_resident_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + 
PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextMakeImageResident", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextMakeImageResident", + start_time, end_time); + } +} + +static void zeContextEvictImageOnEnter( + ze_context_evict_image_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeContextEvictImageOnExit( + ze_context_evict_image_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeContextEvictImage", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeContextEvictImage", + start_time, end_time); + } +} + +static void zeCommandQueueCreateOnEnter( + ze_command_queue_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + 
start_time = collector->GetTimestamp(); +} + +static void zeCommandQueueCreateOnExit( + ze_command_queue_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandQueueCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandQueueCreate", + start_time, end_time); + } +} + +static void zeCommandQueueDestroyOnEnter( + ze_command_queue_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandQueueDestroyOnExit( + ze_command_queue_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandQueueDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandQueueDestroy", + start_time, end_time); + } +} + +static void zeCommandQueueExecuteCommandListsOnEnter( + ze_command_queue_execute_command_lists_params_t* params, + ze_result_t result, + 
void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandQueueExecuteCommandListsOnExit( + ze_command_queue_execute_command_lists_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandQueueExecuteCommandLists", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandQueueExecuteCommandLists", + start_time, end_time); + } +} + +static void zeCommandQueueSynchronizeOnEnter( + ze_command_queue_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandQueueSynchronizeOnExit( + ze_command_queue_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + 
collector->AddFunctionTime("zeCommandQueueSynchronize", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandQueueSynchronize", + start_time, end_time); + } +} + +static void zeCommandListCreateOnEnter( + ze_command_list_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListCreateOnExit( + ze_command_list_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListCreate", + start_time, end_time); + } +} + +static void zeCommandListCreateImmediateOnEnter( + ze_command_list_create_immediate_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListCreateImmediateOnExit( + ze_command_list_create_immediate_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + 
reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListCreateImmediate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListCreateImmediate", + start_time, end_time); + } +} + +static void zeCommandListDestroyOnEnter( + ze_command_list_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListDestroyOnExit( + ze_command_list_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListDestroy", + start_time, end_time); + } +} + +static void zeCommandListCloseOnEnter( + ze_command_list_close_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void 
zeCommandListCloseOnExit( + ze_command_list_close_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListClose", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListClose", + start_time, end_time); + } +} + +static void zeCommandListResetOnEnter( + ze_command_list_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListResetOnExit( + ze_command_list_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListReset", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListReset", + start_time, end_time); + } +} + +static void zeCommandListAppendWriteGlobalTimestampOnEnter( + ze_command_list_append_write_global_timestamp_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + 
PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendWriteGlobalTimestampOnExit( + ze_command_list_append_write_global_timestamp_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendWriteGlobalTimestamp", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendWriteGlobalTimestamp", + start_time, end_time); + } +} + +static void zeCommandListAppendBarrierOnEnter( + ze_command_list_append_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendBarrierOnExit( + ze_command_list_append_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendBarrier", time); + + if 
(collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendBarrier", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryRangesBarrierOnEnter( + ze_command_list_append_memory_ranges_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryRangesBarrierOnExit( + ze_command_list_append_memory_ranges_barrier_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryRangesBarrier", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryRangesBarrier", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryCopyOnEnter( + ze_command_list_append_memory_copy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryCopyOnExit( + ze_command_list_append_memory_copy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + 
ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryCopy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryCopy", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryFillOnEnter( + ze_command_list_append_memory_fill_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryFillOnExit( + ze_command_list_append_memory_fill_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryFill", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryFill", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryCopyRegionOnEnter( + ze_command_list_append_memory_copy_region_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& 
start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryCopyRegionOnExit( + ze_command_list_append_memory_copy_region_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryCopyRegion", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryCopyRegion", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryCopyFromContextOnEnter( + ze_command_list_append_memory_copy_from_context_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryCopyFromContextOnExit( + ze_command_list_append_memory_copy_from_context_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryCopyFromContext", time); + + if (collector->callback_ != nullptr) { + 
collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryCopyFromContext", + start_time, end_time); + } +} + +static void zeCommandListAppendImageCopyOnEnter( + ze_command_list_append_image_copy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendImageCopyOnExit( + ze_command_list_append_image_copy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendImageCopy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendImageCopy", + start_time, end_time); + } +} + +static void zeCommandListAppendImageCopyRegionOnEnter( + ze_command_list_append_image_copy_region_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendImageCopyRegionOnExit( + ze_command_list_append_image_copy_region_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + 
reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendImageCopyRegion", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendImageCopyRegion", + start_time, end_time); + } +} + +static void zeCommandListAppendImageCopyToMemoryOnEnter( + ze_command_list_append_image_copy_to_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendImageCopyToMemoryOnExit( + ze_command_list_append_image_copy_to_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendImageCopyToMemory", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendImageCopyToMemory", + start_time, end_time); + } +} + +static void zeCommandListAppendImageCopyFromMemoryOnEnter( + ze_command_list_append_image_copy_from_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + 
reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendImageCopyFromMemoryOnExit( + ze_command_list_append_image_copy_from_memory_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendImageCopyFromMemory", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendImageCopyFromMemory", + start_time, end_time); + } +} + +static void zeCommandListAppendMemoryPrefetchOnEnter( + ze_command_list_append_memory_prefetch_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemoryPrefetchOnExit( + ze_command_list_append_memory_prefetch_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemoryPrefetch", time); + + if (collector->callback_ != nullptr) { + 
collector->callback_(collector->callback_data_, + "zeCommandListAppendMemoryPrefetch", + start_time, end_time); + } +} + +static void zeCommandListAppendMemAdviseOnEnter( + ze_command_list_append_mem_advise_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendMemAdviseOnExit( + ze_command_list_append_mem_advise_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendMemAdvise", time); + + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendMemAdvise", + start_time, end_time); + } +} + +static void zeCommandListAppendSignalEventOnEnter( + ze_command_list_append_signal_event_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendSignalEventOnExit( + ze_command_list_append_signal_event_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time 
= collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendSignalEvent", time); + + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendSignalEvent", + start_time, end_time); + } +} + +static void zeCommandListAppendWaitOnEventsOnEnter( + ze_command_list_append_wait_on_events_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendWaitOnEventsOnExit( + ze_command_list_append_wait_on_events_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendWaitOnEvents", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendWaitOnEvents", + start_time, end_time); + } +} + +static void zeCommandListAppendEventResetOnEnter( + ze_command_list_append_event_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = 
collector->GetTimestamp(); +} + +static void zeCommandListAppendEventResetOnExit( + ze_command_list_append_event_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendEventReset", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendEventReset", + start_time, end_time); + } +} + +static void zeCommandListAppendQueryKernelTimestampsOnEnter( + ze_command_list_append_query_kernel_timestamps_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendQueryKernelTimestampsOnExit( + ze_command_list_append_query_kernel_timestamps_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendQueryKernelTimestamps", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendQueryKernelTimestamps", + start_time, end_time); + } 
+} + +static void zeCommandListAppendLaunchKernelOnEnter( + ze_command_list_append_launch_kernel_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendLaunchKernelOnExit( + ze_command_list_append_launch_kernel_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendLaunchKernel", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendLaunchKernel", + start_time, end_time); + } +} + +static void zeCommandListAppendLaunchCooperativeKernelOnEnter( + ze_command_list_append_launch_cooperative_kernel_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendLaunchCooperativeKernelOnExit( + ze_command_list_append_launch_cooperative_kernel_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& 
start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendLaunchCooperativeKernel", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendLaunchCooperativeKernel", + start_time, end_time); + } +} + +static void zeCommandListAppendLaunchKernelIndirectOnEnter( + ze_command_list_append_launch_kernel_indirect_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendLaunchKernelIndirectOnExit( + ze_command_list_append_launch_kernel_indirect_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendLaunchKernelIndirect", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendLaunchKernelIndirect", + start_time, end_time); + } +} + +static void zeCommandListAppendLaunchMultipleKernelsIndirectOnEnter( + ze_command_list_append_launch_multiple_kernels_indirect_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = 
*reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeCommandListAppendLaunchMultipleKernelsIndirectOnExit( + ze_command_list_append_launch_multiple_kernels_indirect_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeCommandListAppendLaunchMultipleKernelsIndirect", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeCommandListAppendLaunchMultipleKernelsIndirect", + start_time, end_time); + } +} + +static void zeFenceCreateOnEnter( + ze_fence_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeFenceCreateOnExit( + ze_fence_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeFenceCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeFenceCreate", + start_time, end_time); + } +} + +static void 
zeFenceDestroyOnEnter( + ze_fence_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeFenceDestroyOnExit( + ze_fence_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeFenceDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeFenceDestroy", + start_time, end_time); + } +} + +static void zeFenceHostSynchronizeOnEnter( + ze_fence_host_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeFenceHostSynchronizeOnExit( + ze_fence_host_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeFenceHostSynchronize", 
time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeFenceHostSynchronize", + start_time, end_time); + } +} + +static void zeFenceQueryStatusOnEnter( + ze_fence_query_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeFenceQueryStatusOnExit( + ze_fence_query_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeFenceQueryStatus", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeFenceQueryStatus", + start_time, end_time); + } +} + +static void zeFenceResetOnEnter( + ze_fence_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeFenceResetOnExit( + ze_fence_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + 
PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeFenceReset", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeFenceReset", + start_time, end_time); + } +} + +static void zeEventPoolCreateOnEnter( + ze_event_pool_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventPoolCreateOnExit( + ze_event_pool_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventPoolCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventPoolCreate", + start_time, end_time); + } +} + +static void zeEventPoolDestroyOnEnter( + ze_event_pool_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventPoolDestroyOnExit( + ze_event_pool_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = 
+ reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventPoolDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventPoolDestroy", + start_time, end_time); + } +} + +static void zeEventPoolGetIpcHandleOnEnter( + ze_event_pool_get_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventPoolGetIpcHandleOnExit( + ze_event_pool_get_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventPoolGetIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventPoolGetIpcHandle", + start_time, end_time); + } +} + +static void zeEventPoolOpenIpcHandleOnEnter( + ze_event_pool_open_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void 
zeEventPoolOpenIpcHandleOnExit( + ze_event_pool_open_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventPoolOpenIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventPoolOpenIpcHandle", + start_time, end_time); + } +} + +static void zeEventPoolCloseIpcHandleOnEnter( + ze_event_pool_close_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventPoolCloseIpcHandleOnExit( + ze_event_pool_close_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventPoolCloseIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventPoolCloseIpcHandle", + start_time, end_time); + } +} + +static void zeEventCreateOnEnter( + ze_event_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + 
PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventCreateOnExit( + ze_event_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventCreate", time); + + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventCreate", + start_time, end_time); + } +} + +static void zeEventDestroyOnEnter( + ze_event_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventDestroyOnExit( + ze_event_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventDestroy", + start_time, end_time); + } +} + +static void 
zeEventHostSignalOnEnter( + ze_event_host_signal_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventHostSignalOnExit( + ze_event_host_signal_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventHostSignal", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventHostSignal", + start_time, end_time); + } +} + +static void zeEventHostSynchronizeOnEnter( + ze_event_host_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventHostSynchronizeOnExit( + ze_event_host_synchronize_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + 
collector->AddFunctionTime("zeEventHostSynchronize", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventHostSynchronize", + start_time, end_time); + } +} + +static void zeEventQueryStatusOnEnter( + ze_event_query_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventQueryStatusOnExit( + ze_event_query_status_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventQueryStatus", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventQueryStatus", + start_time, end_time); + } +} + +static void zeEventHostResetOnEnter( + ze_event_host_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventHostResetOnExit( + ze_event_host_reset_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + 
uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventHostReset", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventHostReset", + start_time, end_time); + } +} + +static void zeEventQueryKernelTimestampOnEnter( + ze_event_query_kernel_timestamp_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeEventQueryKernelTimestampOnExit( + ze_event_query_kernel_timestamp_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeEventQueryKernelTimestamp", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeEventQueryKernelTimestamp", + start_time, end_time); + } +} + +static void zeImageGetPropertiesOnEnter( + ze_image_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeImageGetPropertiesOnExit( + ze_image_get_properties_params_t* params, + 
ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeImageGetProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeImageGetProperties", + start_time, end_time); + } +} + +static void zeImageCreateOnEnter( + ze_image_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeImageCreateOnExit( + ze_image_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeImageCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeImageCreate", + start_time, end_time); + } +} + +static void zeImageDestroyOnEnter( + ze_image_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = 
*reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeImageDestroyOnExit( + ze_image_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeImageDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeImageDestroy", + start_time, end_time); + } +} + +static void zeModuleCreateOnEnter( + ze_module_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleCreateOnExit( + ze_module_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleCreate", + start_time, end_time); + } +} + +static void zeModuleDestroyOnEnter( + ze_module_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + 
PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleDestroyOnExit( + ze_module_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleDestroy", + start_time, end_time); + } +} + +static void zeModuleDynamicLinkOnEnter( + ze_module_dynamic_link_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleDynamicLinkOnExit( + ze_module_dynamic_link_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleDynamicLink", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleDynamicLink", + start_time, end_time); + } +} 
+ +static void zeModuleGetNativeBinaryOnEnter( + ze_module_get_native_binary_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleGetNativeBinaryOnExit( + ze_module_get_native_binary_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleGetNativeBinary", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleGetNativeBinary", + start_time, end_time); + } +} + +static void zeModuleGetGlobalPointerOnEnter( + ze_module_get_global_pointer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleGetGlobalPointerOnExit( + ze_module_get_global_pointer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = 
end_time - start_time; + collector->AddFunctionTime("zeModuleGetGlobalPointer", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleGetGlobalPointer", + start_time, end_time); + } +} + +static void zeModuleGetKernelNamesOnEnter( + ze_module_get_kernel_names_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleGetKernelNamesOnExit( + ze_module_get_kernel_names_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleGetKernelNames", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleGetKernelNames", + start_time, end_time); + } +} + +static void zeModuleGetPropertiesOnEnter( + ze_module_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleGetPropertiesOnExit( + ze_module_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + 
reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleGetProperties", time); + + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleGetProperties", + start_time, end_time); + } +} + +static void zeModuleGetFunctionPointerOnEnter( + ze_module_get_function_pointer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleGetFunctionPointerOnExit( + ze_module_get_function_pointer_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleGetFunctionPointer", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleGetFunctionPointer", + start_time, end_time); + } +} + +static void zeModuleBuildLogDestroyOnEnter( + ze_module_build_log_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = 
collector->GetTimestamp(); +} + +static void zeModuleBuildLogDestroyOnExit( + ze_module_build_log_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleBuildLogDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleBuildLogDestroy", + start_time, end_time); + } +} + +static void zeModuleBuildLogGetStringOnEnter( + ze_module_build_log_get_string_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeModuleBuildLogGetStringOnExit( + ze_module_build_log_get_string_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeModuleBuildLogGetString", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeModuleBuildLogGetString", + start_time, end_time); + } +} + +static void zeKernelCreateOnEnter( + ze_kernel_create_params_t* params, + ze_result_t result, + void* 
global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelCreateOnExit( + ze_kernel_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelCreate", + start_time, end_time); + } +} + +static void zeKernelDestroyOnEnter( + ze_kernel_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelDestroyOnExit( + ze_kernel_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelDestroy", + 
start_time, end_time); + } +} + +static void zeKernelSetCacheConfigOnEnter( + ze_kernel_set_cache_config_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSetCacheConfigOnExit( + ze_kernel_set_cache_config_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSetCacheConfig", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSetCacheConfig", + start_time, end_time); + } +} + +static void zeKernelSetGroupSizeOnEnter( + ze_kernel_set_group_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSetGroupSizeOnExit( + ze_kernel_set_group_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + 
uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSetGroupSize", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSetGroupSize", + start_time, end_time); + } +} + +static void zeKernelSuggestGroupSizeOnEnter( + ze_kernel_suggest_group_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSuggestGroupSizeOnExit( + ze_kernel_suggest_group_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSuggestGroupSize", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSuggestGroupSize", + start_time, end_time); + } +} + +static void zeKernelSuggestMaxCooperativeGroupCountOnEnter( + ze_kernel_suggest_max_cooperative_group_count_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSuggestMaxCooperativeGroupCountOnExit( + ze_kernel_suggest_max_cooperative_group_count_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSuggestMaxCooperativeGroupCount", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSuggestMaxCooperativeGroupCount", + start_time, end_time); + } +} + +static void zeKernelSetArgumentValueOnEnter( + ze_kernel_set_argument_value_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSetArgumentValueOnExit( + ze_kernel_set_argument_value_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSetArgumentValue", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSetArgumentValue", + start_time, end_time); + } +} + +static void zeKernelSetIndirectAccessOnEnter( + ze_kernel_set_indirect_access_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + 
reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelSetIndirectAccessOnExit( + ze_kernel_set_indirect_access_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelSetIndirectAccess", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelSetIndirectAccess", + start_time, end_time); + } +} + +static void zeKernelGetIndirectAccessOnEnter( + ze_kernel_get_indirect_access_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelGetIndirectAccessOnExit( + ze_kernel_get_indirect_access_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelGetIndirectAccess", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelGetIndirectAccess", + start_time, end_time); 
+ } +} + +static void zeKernelGetSourceAttributesOnEnter( + ze_kernel_get_source_attributes_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelGetSourceAttributesOnExit( + ze_kernel_get_source_attributes_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelGetSourceAttributes", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelGetSourceAttributes", + start_time, end_time); + } +} + +static void zeKernelGetPropertiesOnEnter( + ze_kernel_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelGetPropertiesOnExit( + ze_kernel_get_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); 
+ uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelGetProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelGetProperties", + start_time, end_time); + } +} + +static void zeKernelGetNameOnEnter( + ze_kernel_get_name_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeKernelGetNameOnExit( + ze_kernel_get_name_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeKernelGetName", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeKernelGetName", + start_time, end_time); + } +} + +static void zeSamplerCreateOnEnter( + ze_sampler_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeSamplerCreateOnExit( + ze_sampler_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = 
collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeSamplerCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeSamplerCreate", + start_time, end_time); + } +} + +static void zeSamplerDestroyOnEnter( + ze_sampler_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeSamplerDestroyOnExit( + ze_sampler_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeSamplerDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeSamplerDestroy", + start_time, end_time); + } +} + +static void zePhysicalMemCreateOnEnter( + ze_physical_mem_create_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zePhysicalMemCreateOnExit( + ze_physical_mem_create_params_t* params, + ze_result_t result, + void* global_user_data, + 
void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zePhysicalMemCreate", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zePhysicalMemCreate", + start_time, end_time); + } +} + +static void zePhysicalMemDestroyOnEnter( + ze_physical_mem_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zePhysicalMemDestroyOnExit( + ze_physical_mem_destroy_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zePhysicalMemDestroy", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zePhysicalMemDestroy", + start_time, end_time); + } +} + +static void zeMemAllocSharedOnEnter( + ze_mem_alloc_shared_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = 
*reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemAllocSharedOnExit( + ze_mem_alloc_shared_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemAllocShared", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemAllocShared", + start_time, end_time); + } +} + +static void zeMemAllocDeviceOnEnter( + ze_mem_alloc_device_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemAllocDeviceOnExit( + ze_mem_alloc_device_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemAllocDevice", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemAllocDevice", + start_time, end_time); + } +} + +static void zeMemAllocHostOnEnter( + ze_mem_alloc_host_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemAllocHostOnExit( + ze_mem_alloc_host_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemAllocHost", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemAllocHost", + start_time, end_time); + } +} + +static void zeMemFreeOnEnter( + ze_mem_free_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemFreeOnExit( + ze_mem_free_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemFree", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemFree", + start_time, end_time); + } +} + +static void 
zeMemGetAllocPropertiesOnEnter( + ze_mem_get_alloc_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemGetAllocPropertiesOnExit( + ze_mem_get_alloc_properties_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemGetAllocProperties", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemGetAllocProperties", + start_time, end_time); + } +} + +static void zeMemGetAddressRangeOnEnter( + ze_mem_get_address_range_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemGetAddressRangeOnExit( + ze_mem_get_address_range_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + 
collector->AddFunctionTime("zeMemGetAddressRange", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemGetAddressRange", + start_time, end_time); + } +} + +static void zeMemGetIpcHandleOnEnter( + ze_mem_get_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemGetIpcHandleOnExit( + ze_mem_get_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemGetIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemGetIpcHandle", + start_time, end_time); + } +} + +static void zeMemOpenIpcHandleOnEnter( + ze_mem_open_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemOpenIpcHandleOnExit( + ze_mem_open_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + 
+ uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemOpenIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemOpenIpcHandle", + start_time, end_time); + } +} + +static void zeMemCloseIpcHandleOnEnter( + ze_mem_close_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeMemCloseIpcHandleOnExit( + ze_mem_close_ipc_handle_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeMemCloseIpcHandle", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeMemCloseIpcHandle", + start_time, end_time); + } +} + +static void zeVirtualMemReserveOnEnter( + ze_virtual_mem_reserve_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemReserveOnExit( + ze_virtual_mem_reserve_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemReserve", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemReserve", + start_time, end_time); + } +} + +static void zeVirtualMemFreeOnEnter( + ze_virtual_mem_free_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemFreeOnExit( + ze_virtual_mem_free_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemFree", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemFree", + start_time, end_time); + } +} + +static void zeVirtualMemQueryPageSizeOnEnter( + ze_virtual_mem_query_page_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + 
start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemQueryPageSizeOnExit( + ze_virtual_mem_query_page_size_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemQueryPageSize", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemQueryPageSize", + start_time, end_time); + } +} + +static void zeVirtualMemMapOnEnter( + ze_virtual_mem_map_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemMapOnExit( + ze_virtual_mem_map_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemMap", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemMap", + start_time, end_time); + } +} + +static void zeVirtualMemUnmapOnEnter( + ze_virtual_mem_unmap_params_t* params, + ze_result_t result, + void* global_user_data, + void** 
instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemUnmapOnExit( + ze_virtual_mem_unmap_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemUnmap", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemUnmap", + start_time, end_time); + } +} + +static void zeVirtualMemSetAccessAttributeOnEnter( + ze_virtual_mem_set_access_attribute_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemSetAccessAttributeOnExit( + ze_virtual_mem_set_access_attribute_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemSetAccessAttribute", time); + + if (collector->callback_ != nullptr) { + 
collector->callback_(collector->callback_data_, + "zeVirtualMemSetAccessAttribute", + start_time, end_time); + } +} + +static void zeVirtualMemGetAccessAttributeOnEnter( + ze_virtual_mem_get_access_attribute_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t& start_time = *reinterpret_cast(instance_user_data); + start_time = collector->GetTimestamp(); +} + +static void zeVirtualMemGetAccessAttributeOnExit( + ze_virtual_mem_get_access_attribute_params_t* params, + ze_result_t result, + void* global_user_data, + void** instance_user_data) { + PTI_ASSERT(global_user_data != nullptr); + ZeApiCollector* collector = + reinterpret_cast(global_user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast(instance_user_data); + PTI_ASSERT(start_time > 0); + PTI_ASSERT(start_time < end_time); + uint64_t time = end_time - start_time; + collector->AddFunctionTime("zeVirtualMemGetAccessAttribute", time); + + if (collector->callback_ != nullptr) { + collector->callback_(collector->callback_data_, + "zeVirtualMemGetAccessAttribute", + start_time, end_time); + } +} + +static void SetTracingFunctions(zel_tracer_handle_t tracer) { + zet_core_callbacks_t prologue = {}; + zet_core_callbacks_t epilogue = {}; + + prologue.Global.pfnInitCb = zeInitOnEnter; + epilogue.Global.pfnInitCb = zeInitOnExit; + prologue.Driver.pfnGetCb = zeDriverGetOnEnter; + epilogue.Driver.pfnGetCb = zeDriverGetOnExit; + prologue.Driver.pfnGetApiVersionCb = zeDriverGetApiVersionOnEnter; + epilogue.Driver.pfnGetApiVersionCb = zeDriverGetApiVersionOnExit; + prologue.Driver.pfnGetPropertiesCb = zeDriverGetPropertiesOnEnter; + epilogue.Driver.pfnGetPropertiesCb = zeDriverGetPropertiesOnExit; + prologue.Driver.pfnGetIpcPropertiesCb = zeDriverGetIpcPropertiesOnEnter; + epilogue.Driver.pfnGetIpcPropertiesCb = 
zeDriverGetIpcPropertiesOnExit; + prologue.Driver.pfnGetExtensionPropertiesCb = zeDriverGetExtensionPropertiesOnEnter; + epilogue.Driver.pfnGetExtensionPropertiesCb = zeDriverGetExtensionPropertiesOnExit; + prologue.Device.pfnGetCb = zeDeviceGetOnEnter; + epilogue.Device.pfnGetCb = zeDeviceGetOnExit; + prologue.Device.pfnGetSubDevicesCb = zeDeviceGetSubDevicesOnEnter; + epilogue.Device.pfnGetSubDevicesCb = zeDeviceGetSubDevicesOnExit; + prologue.Device.pfnGetPropertiesCb = zeDeviceGetPropertiesOnEnter; + epilogue.Device.pfnGetPropertiesCb = zeDeviceGetPropertiesOnExit; + prologue.Device.pfnGetComputePropertiesCb = zeDeviceGetComputePropertiesOnEnter; + epilogue.Device.pfnGetComputePropertiesCb = zeDeviceGetComputePropertiesOnExit; + prologue.Device.pfnGetModulePropertiesCb = zeDeviceGetModulePropertiesOnEnter; + epilogue.Device.pfnGetModulePropertiesCb = zeDeviceGetModulePropertiesOnExit; + prologue.Device.pfnGetCommandQueueGroupPropertiesCb = zeDeviceGetCommandQueueGroupPropertiesOnEnter; + epilogue.Device.pfnGetCommandQueueGroupPropertiesCb = zeDeviceGetCommandQueueGroupPropertiesOnExit; + prologue.Device.pfnGetMemoryPropertiesCb = zeDeviceGetMemoryPropertiesOnEnter; + epilogue.Device.pfnGetMemoryPropertiesCb = zeDeviceGetMemoryPropertiesOnExit; + prologue.Device.pfnGetMemoryAccessPropertiesCb = zeDeviceGetMemoryAccessPropertiesOnEnter; + epilogue.Device.pfnGetMemoryAccessPropertiesCb = zeDeviceGetMemoryAccessPropertiesOnExit; + prologue.Device.pfnGetCachePropertiesCb = zeDeviceGetCachePropertiesOnEnter; + epilogue.Device.pfnGetCachePropertiesCb = zeDeviceGetCachePropertiesOnExit; + prologue.Device.pfnGetImagePropertiesCb = zeDeviceGetImagePropertiesOnEnter; + epilogue.Device.pfnGetImagePropertiesCb = zeDeviceGetImagePropertiesOnExit; + prologue.Device.pfnGetExternalMemoryPropertiesCb = zeDeviceGetExternalMemoryPropertiesOnEnter; + epilogue.Device.pfnGetExternalMemoryPropertiesCb = zeDeviceGetExternalMemoryPropertiesOnExit; + prologue.Device.pfnGetP2PPropertiesCb 
= zeDeviceGetP2PPropertiesOnEnter; + epilogue.Device.pfnGetP2PPropertiesCb = zeDeviceGetP2PPropertiesOnExit; + prologue.Device.pfnCanAccessPeerCb = zeDeviceCanAccessPeerOnEnter; + epilogue.Device.pfnCanAccessPeerCb = zeDeviceCanAccessPeerOnExit; + prologue.Device.pfnGetStatusCb = zeDeviceGetStatusOnEnter; + epilogue.Device.pfnGetStatusCb = zeDeviceGetStatusOnExit; + prologue.Context.pfnCreateCb = zeContextCreateOnEnter; + epilogue.Context.pfnCreateCb = zeContextCreateOnExit; + prologue.Context.pfnDestroyCb = zeContextDestroyOnEnter; + epilogue.Context.pfnDestroyCb = zeContextDestroyOnExit; + prologue.Context.pfnGetStatusCb = zeContextGetStatusOnEnter; + epilogue.Context.pfnGetStatusCb = zeContextGetStatusOnExit; + prologue.Context.pfnSystemBarrierCb = zeContextSystemBarrierOnEnter; + epilogue.Context.pfnSystemBarrierCb = zeContextSystemBarrierOnExit; + prologue.Context.pfnMakeMemoryResidentCb = zeContextMakeMemoryResidentOnEnter; + epilogue.Context.pfnMakeMemoryResidentCb = zeContextMakeMemoryResidentOnExit; + prologue.Context.pfnEvictMemoryCb = zeContextEvictMemoryOnEnter; + epilogue.Context.pfnEvictMemoryCb = zeContextEvictMemoryOnExit; + prologue.Context.pfnMakeImageResidentCb = zeContextMakeImageResidentOnEnter; + epilogue.Context.pfnMakeImageResidentCb = zeContextMakeImageResidentOnExit; + prologue.Context.pfnEvictImageCb = zeContextEvictImageOnEnter; + epilogue.Context.pfnEvictImageCb = zeContextEvictImageOnExit; + prologue.CommandQueue.pfnCreateCb = zeCommandQueueCreateOnEnter; + epilogue.CommandQueue.pfnCreateCb = zeCommandQueueCreateOnExit; + prologue.CommandQueue.pfnDestroyCb = zeCommandQueueDestroyOnEnter; + epilogue.CommandQueue.pfnDestroyCb = zeCommandQueueDestroyOnExit; + prologue.CommandQueue.pfnExecuteCommandListsCb = zeCommandQueueExecuteCommandListsOnEnter; + epilogue.CommandQueue.pfnExecuteCommandListsCb = zeCommandQueueExecuteCommandListsOnExit; + prologue.CommandQueue.pfnSynchronizeCb = zeCommandQueueSynchronizeOnEnter; + 
epilogue.CommandQueue.pfnSynchronizeCb = zeCommandQueueSynchronizeOnExit; + prologue.CommandList.pfnCreateCb = zeCommandListCreateOnEnter; + epilogue.CommandList.pfnCreateCb = zeCommandListCreateOnExit; + prologue.CommandList.pfnCreateImmediateCb = zeCommandListCreateImmediateOnEnter; + epilogue.CommandList.pfnCreateImmediateCb = zeCommandListCreateImmediateOnExit; + prologue.CommandList.pfnDestroyCb = zeCommandListDestroyOnEnter; + epilogue.CommandList.pfnDestroyCb = zeCommandListDestroyOnExit; + prologue.CommandList.pfnCloseCb = zeCommandListCloseOnEnter; + epilogue.CommandList.pfnCloseCb = zeCommandListCloseOnExit; + prologue.CommandList.pfnResetCb = zeCommandListResetOnEnter; + epilogue.CommandList.pfnResetCb = zeCommandListResetOnExit; + prologue.CommandList.pfnAppendWriteGlobalTimestampCb = zeCommandListAppendWriteGlobalTimestampOnEnter; + epilogue.CommandList.pfnAppendWriteGlobalTimestampCb = zeCommandListAppendWriteGlobalTimestampOnExit; + prologue.CommandList.pfnAppendBarrierCb = zeCommandListAppendBarrierOnEnter; + epilogue.CommandList.pfnAppendBarrierCb = zeCommandListAppendBarrierOnExit; + prologue.CommandList.pfnAppendMemoryRangesBarrierCb = zeCommandListAppendMemoryRangesBarrierOnEnter; + epilogue.CommandList.pfnAppendMemoryRangesBarrierCb = zeCommandListAppendMemoryRangesBarrierOnExit; + prologue.CommandList.pfnAppendMemoryCopyCb = zeCommandListAppendMemoryCopyOnEnter; + epilogue.CommandList.pfnAppendMemoryCopyCb = zeCommandListAppendMemoryCopyOnExit; + prologue.CommandList.pfnAppendMemoryFillCb = zeCommandListAppendMemoryFillOnEnter; + epilogue.CommandList.pfnAppendMemoryFillCb = zeCommandListAppendMemoryFillOnExit; + prologue.CommandList.pfnAppendMemoryCopyRegionCb = zeCommandListAppendMemoryCopyRegionOnEnter; + epilogue.CommandList.pfnAppendMemoryCopyRegionCb = zeCommandListAppendMemoryCopyRegionOnExit; + prologue.CommandList.pfnAppendMemoryCopyFromContextCb = zeCommandListAppendMemoryCopyFromContextOnEnter; + 
epilogue.CommandList.pfnAppendMemoryCopyFromContextCb = zeCommandListAppendMemoryCopyFromContextOnExit; + prologue.CommandList.pfnAppendImageCopyCb = zeCommandListAppendImageCopyOnEnter; + epilogue.CommandList.pfnAppendImageCopyCb = zeCommandListAppendImageCopyOnExit; + prologue.CommandList.pfnAppendImageCopyRegionCb = zeCommandListAppendImageCopyRegionOnEnter; + epilogue.CommandList.pfnAppendImageCopyRegionCb = zeCommandListAppendImageCopyRegionOnExit; + prologue.CommandList.pfnAppendImageCopyToMemoryCb = zeCommandListAppendImageCopyToMemoryOnEnter; + epilogue.CommandList.pfnAppendImageCopyToMemoryCb = zeCommandListAppendImageCopyToMemoryOnExit; + prologue.CommandList.pfnAppendImageCopyFromMemoryCb = zeCommandListAppendImageCopyFromMemoryOnEnter; + epilogue.CommandList.pfnAppendImageCopyFromMemoryCb = zeCommandListAppendImageCopyFromMemoryOnExit; + prologue.CommandList.pfnAppendMemoryPrefetchCb = zeCommandListAppendMemoryPrefetchOnEnter; + epilogue.CommandList.pfnAppendMemoryPrefetchCb = zeCommandListAppendMemoryPrefetchOnExit; + prologue.CommandList.pfnAppendMemAdviseCb = zeCommandListAppendMemAdviseOnEnter; + epilogue.CommandList.pfnAppendMemAdviseCb = zeCommandListAppendMemAdviseOnExit; + prologue.CommandList.pfnAppendSignalEventCb = zeCommandListAppendSignalEventOnEnter; + epilogue.CommandList.pfnAppendSignalEventCb = zeCommandListAppendSignalEventOnExit; + prologue.CommandList.pfnAppendWaitOnEventsCb = zeCommandListAppendWaitOnEventsOnEnter; + epilogue.CommandList.pfnAppendWaitOnEventsCb = zeCommandListAppendWaitOnEventsOnExit; + prologue.CommandList.pfnAppendEventResetCb = zeCommandListAppendEventResetOnEnter; + epilogue.CommandList.pfnAppendEventResetCb = zeCommandListAppendEventResetOnExit; + prologue.CommandList.pfnAppendQueryKernelTimestampsCb = zeCommandListAppendQueryKernelTimestampsOnEnter; + epilogue.CommandList.pfnAppendQueryKernelTimestampsCb = zeCommandListAppendQueryKernelTimestampsOnExit; + prologue.CommandList.pfnAppendLaunchKernelCb = 
zeCommandListAppendLaunchKernelOnEnter; + epilogue.CommandList.pfnAppendLaunchKernelCb = zeCommandListAppendLaunchKernelOnExit; + prologue.CommandList.pfnAppendLaunchCooperativeKernelCb = zeCommandListAppendLaunchCooperativeKernelOnEnter; + epilogue.CommandList.pfnAppendLaunchCooperativeKernelCb = zeCommandListAppendLaunchCooperativeKernelOnExit; + prologue.CommandList.pfnAppendLaunchKernelIndirectCb = zeCommandListAppendLaunchKernelIndirectOnEnter; + epilogue.CommandList.pfnAppendLaunchKernelIndirectCb = zeCommandListAppendLaunchKernelIndirectOnExit; + prologue.CommandList.pfnAppendLaunchMultipleKernelsIndirectCb = zeCommandListAppendLaunchMultipleKernelsIndirectOnEnter; + epilogue.CommandList.pfnAppendLaunchMultipleKernelsIndirectCb = zeCommandListAppendLaunchMultipleKernelsIndirectOnExit; + prologue.Fence.pfnCreateCb = zeFenceCreateOnEnter; + epilogue.Fence.pfnCreateCb = zeFenceCreateOnExit; + prologue.Fence.pfnDestroyCb = zeFenceDestroyOnEnter; + epilogue.Fence.pfnDestroyCb = zeFenceDestroyOnExit; + prologue.Fence.pfnHostSynchronizeCb = zeFenceHostSynchronizeOnEnter; + epilogue.Fence.pfnHostSynchronizeCb = zeFenceHostSynchronizeOnExit; + prologue.Fence.pfnQueryStatusCb = zeFenceQueryStatusOnEnter; + epilogue.Fence.pfnQueryStatusCb = zeFenceQueryStatusOnExit; + prologue.Fence.pfnResetCb = zeFenceResetOnEnter; + epilogue.Fence.pfnResetCb = zeFenceResetOnExit; + prologue.EventPool.pfnCreateCb = zeEventPoolCreateOnEnter; + epilogue.EventPool.pfnCreateCb = zeEventPoolCreateOnExit; + prologue.EventPool.pfnDestroyCb = zeEventPoolDestroyOnEnter; + epilogue.EventPool.pfnDestroyCb = zeEventPoolDestroyOnExit; + prologue.EventPool.pfnGetIpcHandleCb = zeEventPoolGetIpcHandleOnEnter; + epilogue.EventPool.pfnGetIpcHandleCb = zeEventPoolGetIpcHandleOnExit; + prologue.EventPool.pfnOpenIpcHandleCb = zeEventPoolOpenIpcHandleOnEnter; + epilogue.EventPool.pfnOpenIpcHandleCb = zeEventPoolOpenIpcHandleOnExit; + prologue.EventPool.pfnCloseIpcHandleCb = 
zeEventPoolCloseIpcHandleOnEnter; + epilogue.EventPool.pfnCloseIpcHandleCb = zeEventPoolCloseIpcHandleOnExit; + prologue.Event.pfnCreateCb = zeEventCreateOnEnter; + epilogue.Event.pfnCreateCb = zeEventCreateOnExit; + prologue.Event.pfnDestroyCb = zeEventDestroyOnEnter; + epilogue.Event.pfnDestroyCb = zeEventDestroyOnExit; + prologue.Event.pfnHostSignalCb = zeEventHostSignalOnEnter; + epilogue.Event.pfnHostSignalCb = zeEventHostSignalOnExit; + prologue.Event.pfnHostSynchronizeCb = zeEventHostSynchronizeOnEnter; + epilogue.Event.pfnHostSynchronizeCb = zeEventHostSynchronizeOnExit; + prologue.Event.pfnQueryStatusCb = zeEventQueryStatusOnEnter; + epilogue.Event.pfnQueryStatusCb = zeEventQueryStatusOnExit; + prologue.Event.pfnHostResetCb = zeEventHostResetOnEnter; + epilogue.Event.pfnHostResetCb = zeEventHostResetOnExit; + prologue.Event.pfnQueryKernelTimestampCb = zeEventQueryKernelTimestampOnEnter; + epilogue.Event.pfnQueryKernelTimestampCb = zeEventQueryKernelTimestampOnExit; + prologue.Image.pfnGetPropertiesCb = zeImageGetPropertiesOnEnter; + epilogue.Image.pfnGetPropertiesCb = zeImageGetPropertiesOnExit; + prologue.Image.pfnCreateCb = zeImageCreateOnEnter; + epilogue.Image.pfnCreateCb = zeImageCreateOnExit; + prologue.Image.pfnDestroyCb = zeImageDestroyOnEnter; + epilogue.Image.pfnDestroyCb = zeImageDestroyOnExit; + prologue.Module.pfnCreateCb = zeModuleCreateOnEnter; + epilogue.Module.pfnCreateCb = zeModuleCreateOnExit; + prologue.Module.pfnDestroyCb = zeModuleDestroyOnEnter; + epilogue.Module.pfnDestroyCb = zeModuleDestroyOnExit; + prologue.Module.pfnDynamicLinkCb = zeModuleDynamicLinkOnEnter; + epilogue.Module.pfnDynamicLinkCb = zeModuleDynamicLinkOnExit; + prologue.Module.pfnGetNativeBinaryCb = zeModuleGetNativeBinaryOnEnter; + epilogue.Module.pfnGetNativeBinaryCb = zeModuleGetNativeBinaryOnExit; + prologue.Module.pfnGetGlobalPointerCb = zeModuleGetGlobalPointerOnEnter; + epilogue.Module.pfnGetGlobalPointerCb = zeModuleGetGlobalPointerOnExit; + 
prologue.Module.pfnGetKernelNamesCb = zeModuleGetKernelNamesOnEnter; + epilogue.Module.pfnGetKernelNamesCb = zeModuleGetKernelNamesOnExit; + prologue.Module.pfnGetPropertiesCb = zeModuleGetPropertiesOnEnter; + epilogue.Module.pfnGetPropertiesCb = zeModuleGetPropertiesOnExit; + prologue.Module.pfnGetFunctionPointerCb = zeModuleGetFunctionPointerOnEnter; + epilogue.Module.pfnGetFunctionPointerCb = zeModuleGetFunctionPointerOnExit; + prologue.ModuleBuildLog.pfnDestroyCb = zeModuleBuildLogDestroyOnEnter; + epilogue.ModuleBuildLog.pfnDestroyCb = zeModuleBuildLogDestroyOnExit; + prologue.ModuleBuildLog.pfnGetStringCb = zeModuleBuildLogGetStringOnEnter; + epilogue.ModuleBuildLog.pfnGetStringCb = zeModuleBuildLogGetStringOnExit; + prologue.Kernel.pfnCreateCb = zeKernelCreateOnEnter; + epilogue.Kernel.pfnCreateCb = zeKernelCreateOnExit; + prologue.Kernel.pfnDestroyCb = zeKernelDestroyOnEnter; + epilogue.Kernel.pfnDestroyCb = zeKernelDestroyOnExit; + prologue.Kernel.pfnSetCacheConfigCb = zeKernelSetCacheConfigOnEnter; + epilogue.Kernel.pfnSetCacheConfigCb = zeKernelSetCacheConfigOnExit; + prologue.Kernel.pfnSetGroupSizeCb = zeKernelSetGroupSizeOnEnter; + epilogue.Kernel.pfnSetGroupSizeCb = zeKernelSetGroupSizeOnExit; + prologue.Kernel.pfnSuggestGroupSizeCb = zeKernelSuggestGroupSizeOnEnter; + epilogue.Kernel.pfnSuggestGroupSizeCb = zeKernelSuggestGroupSizeOnExit; + prologue.Kernel.pfnSuggestMaxCooperativeGroupCountCb = zeKernelSuggestMaxCooperativeGroupCountOnEnter; + epilogue.Kernel.pfnSuggestMaxCooperativeGroupCountCb = zeKernelSuggestMaxCooperativeGroupCountOnExit; + prologue.Kernel.pfnSetArgumentValueCb = zeKernelSetArgumentValueOnEnter; + epilogue.Kernel.pfnSetArgumentValueCb = zeKernelSetArgumentValueOnExit; + prologue.Kernel.pfnSetIndirectAccessCb = zeKernelSetIndirectAccessOnEnter; + epilogue.Kernel.pfnSetIndirectAccessCb = zeKernelSetIndirectAccessOnExit; + prologue.Kernel.pfnGetIndirectAccessCb = zeKernelGetIndirectAccessOnEnter; + 
epilogue.Kernel.pfnGetIndirectAccessCb = zeKernelGetIndirectAccessOnExit; + prologue.Kernel.pfnGetSourceAttributesCb = zeKernelGetSourceAttributesOnEnter; + epilogue.Kernel.pfnGetSourceAttributesCb = zeKernelGetSourceAttributesOnExit; + prologue.Kernel.pfnGetPropertiesCb = zeKernelGetPropertiesOnEnter; + epilogue.Kernel.pfnGetPropertiesCb = zeKernelGetPropertiesOnExit; + prologue.Kernel.pfnGetNameCb = zeKernelGetNameOnEnter; + epilogue.Kernel.pfnGetNameCb = zeKernelGetNameOnExit; + prologue.Sampler.pfnCreateCb = zeSamplerCreateOnEnter; + epilogue.Sampler.pfnCreateCb = zeSamplerCreateOnExit; + prologue.Sampler.pfnDestroyCb = zeSamplerDestroyOnEnter; + epilogue.Sampler.pfnDestroyCb = zeSamplerDestroyOnExit; + prologue.PhysicalMem.pfnCreateCb = zePhysicalMemCreateOnEnter; + epilogue.PhysicalMem.pfnCreateCb = zePhysicalMemCreateOnExit; + prologue.PhysicalMem.pfnDestroyCb = zePhysicalMemDestroyOnEnter; + epilogue.PhysicalMem.pfnDestroyCb = zePhysicalMemDestroyOnExit; + prologue.Mem.pfnAllocSharedCb = zeMemAllocSharedOnEnter; + epilogue.Mem.pfnAllocSharedCb = zeMemAllocSharedOnExit; + prologue.Mem.pfnAllocDeviceCb = zeMemAllocDeviceOnEnter; + epilogue.Mem.pfnAllocDeviceCb = zeMemAllocDeviceOnExit; + prologue.Mem.pfnAllocHostCb = zeMemAllocHostOnEnter; + epilogue.Mem.pfnAllocHostCb = zeMemAllocHostOnExit; + prologue.Mem.pfnFreeCb = zeMemFreeOnEnter; + epilogue.Mem.pfnFreeCb = zeMemFreeOnExit; + prologue.Mem.pfnGetAllocPropertiesCb = zeMemGetAllocPropertiesOnEnter; + epilogue.Mem.pfnGetAllocPropertiesCb = zeMemGetAllocPropertiesOnExit; + prologue.Mem.pfnGetAddressRangeCb = zeMemGetAddressRangeOnEnter; + epilogue.Mem.pfnGetAddressRangeCb = zeMemGetAddressRangeOnExit; + prologue.Mem.pfnGetIpcHandleCb = zeMemGetIpcHandleOnEnter; + epilogue.Mem.pfnGetIpcHandleCb = zeMemGetIpcHandleOnExit; + prologue.Mem.pfnOpenIpcHandleCb = zeMemOpenIpcHandleOnEnter; + epilogue.Mem.pfnOpenIpcHandleCb = zeMemOpenIpcHandleOnExit; + prologue.Mem.pfnCloseIpcHandleCb = zeMemCloseIpcHandleOnEnter; + 
epilogue.Mem.pfnCloseIpcHandleCb = zeMemCloseIpcHandleOnExit; + prologue.VirtualMem.pfnReserveCb = zeVirtualMemReserveOnEnter; + epilogue.VirtualMem.pfnReserveCb = zeVirtualMemReserveOnExit; + prologue.VirtualMem.pfnFreeCb = zeVirtualMemFreeOnEnter; + epilogue.VirtualMem.pfnFreeCb = zeVirtualMemFreeOnExit; + prologue.VirtualMem.pfnQueryPageSizeCb = zeVirtualMemQueryPageSizeOnEnter; + epilogue.VirtualMem.pfnQueryPageSizeCb = zeVirtualMemQueryPageSizeOnExit; + prologue.VirtualMem.pfnMapCb = zeVirtualMemMapOnEnter; + epilogue.VirtualMem.pfnMapCb = zeVirtualMemMapOnExit; + prologue.VirtualMem.pfnUnmapCb = zeVirtualMemUnmapOnEnter; + epilogue.VirtualMem.pfnUnmapCb = zeVirtualMemUnmapOnExit; + prologue.VirtualMem.pfnSetAccessAttributeCb = zeVirtualMemSetAccessAttributeOnEnter; + epilogue.VirtualMem.pfnSetAccessAttributeCb = zeVirtualMemSetAccessAttributeOnExit; + prologue.VirtualMem.pfnGetAccessAttributeCb = zeVirtualMemGetAccessAttributeOnEnter; + epilogue.VirtualMem.pfnGetAccessAttributeCb = zeVirtualMemGetAccessAttributeOnExit; + + ze_result_t status = ZE_RESULT_SUCCESS; + status = zelTracerSetPrologues(tracer, &prologue); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + status = zelTracerSetEpilogues(tracer, &epilogue); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); +} + diff --git a/src/apex/L0/ze_api_collector.h b/src/apex/L0/ze_api_collector.h new file mode 100644 index 00000000..be42d555 --- /dev/null +++ b/src/apex/L0/ze_api_collector.h @@ -0,0 +1,203 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +struct ZeFunction { + uint64_t total_time; + uint64_t min_time; + uint64_t max_time; + uint64_t call_count; + + bool operator>(const ZeFunction& r) const { + if (total_time != r.total_time) { + return 
total_time > r.total_time; + } + return call_count > r.call_count; + } + + bool operator!=(const ZeFunction& r) const { + if (total_time == r.total_time) { + return call_count != r.call_count; + } + return true; + } +}; + + +typedef void (*OnFunctionFinishCallback)( + void* data, const std::string& name, + uint64_t started, uint64_t ended); + + +using ZeFunctionInfoMap = std::map; + +static void SetTracingFunctions(zel_tracer_handle_t tracer); + +class ZeApiCollector { + public: // User Interface + static ZeApiCollector* Create( + ze_driver_handle_t driver, + OnFunctionFinishCallback callback = nullptr, + void* callback_data = nullptr ) + { + + PTI_ASSERT(driver != nullptr); + + ze_context_handle_t context = utils::ze::GetContext(driver); + PTI_ASSERT(context != nullptr); + + ZeApiCollector* collector = new ZeApiCollector(context, callback, callback_data); + PTI_ASSERT(collector != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + zel_tracer_desc_t tracer_desc = { + ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr, collector}; + zel_tracer_handle_t tracer = nullptr; + + status = zelTracerCreate(&tracer_desc, &tracer); + if (status != ZE_RESULT_SUCCESS || tracer == nullptr) { + std::cerr << "[WARNING] Unable to create L0 tracer" << std::endl; + delete collector; + return nullptr; + } + + collector->tracer_ = tracer; + SetTracingFunctions(tracer); + + status = zelTracerSetEnabled(tracer, true); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + return collector; + } + + void DisableTracing() { + PTI_ASSERT(tracer_ != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + status = zelTracerSetEnabled(tracer_, false); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + const ZeFunctionInfoMap& GetFunctionInfoMap() const { + return function_info_map_; + } + + static void PrintFunctionsTable(const ZeFunctionInfoMap& function_info_map) { + std::set< std::pair, + utils::Comparator > sorted_list( + function_info_map.begin(), function_info_map.end()); + + uint64_t total_duration 
= 0; + size_t max_name_length = kFunctionLength; + for (auto& value : sorted_list) { + total_duration += value.second.total_time; + if (value.first.size() > max_name_length) { + max_name_length = value.first.size(); + } + } + + if (total_duration == 0) { + return; + } + + std::cerr << std::setw(max_name_length) << "Function" << "," << + std::setw(kCallsLength) << "Calls" << "," << + std::setw(kTimeLength) << "Time (ns)" << "," << + std::setw(kPercentLength) << "Time (%)" << "," << + std::setw(kTimeLength) << "Average (ns)" << "," << + std::setw(kTimeLength) << "Min (ns)" << "," << + std::setw(kTimeLength) << "Max (ns)" << std::endl; + + for (auto& value : sorted_list) { + const std::string& function = value.first; + uint64_t call_count = value.second.call_count; + uint64_t duration = value.second.total_time; + uint64_t avg_duration = duration / call_count; + uint64_t min_duration = value.second.min_time; + uint64_t max_duration = value.second.max_time; + float percent_duration = 100.0f * duration / total_duration; + std::cerr << std::setw(max_name_length) << function << "," << + std::setw(kCallsLength) << call_count << "," << + std::setw(kTimeLength) << duration << "," << + std::setw(kPercentLength) << std::setprecision(2) << + std::fixed << percent_duration << "," << + std::setw(kTimeLength) << avg_duration << "," << + std::setw(kTimeLength) << min_duration << "," << + std::setw(kTimeLength) << max_duration << std::endl; + } + } + + ~ZeApiCollector() { + if (tracer_ != nullptr) { + ze_result_t status = zelTracerDestroy(tracer_); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + } + + uint64_t GetTimestamp() const { + std::chrono::duration timestamp = + std::chrono::steady_clock::now() - base_time_; + return timestamp.count(); + } + + void AddFunctionTime(const std::string& name, uint64_t time) { + const std::lock_guard lock(lock_); + if (function_info_map_.count(name) == 0) { + function_info_map_[name] = {time, time, time, 1}; + } else { + ZeFunction& function = 
function_info_map_[name]; + function.total_time += time; + if (time < function.min_time) { + function.min_time = time; + } + if (time > function.max_time) { + function.max_time = time; + } + ++function.call_count; + } + } + + private: // Implementation Details + ZeApiCollector(ze_context_handle_t context, + OnFunctionFinishCallback callback, + void* callback_data) + : context_(context), + callback_(callback), + callback_data_(callback_data) { + PTI_ASSERT(context_ != nullptr); + } + +#include + + private: // Data + ze_context_handle_t context_ = nullptr; + zel_tracer_handle_t tracer_ = nullptr; + std::chrono::time_point base_time_; + + ZeFunctionInfoMap function_info_map_; + std::mutex lock_; + + static const uint32_t kFunctionLength = 10; + static const uint32_t kCallsLength = 12; + static const uint32_t kTimeLength = 20; + static const uint32_t kPercentLength = 10; + + OnFunctionFinishCallback callback_ = nullptr; + void* callback_data_ = nullptr; +}; + diff --git a/src/apex/L0/ze_kernel_collector.h b/src/apex/L0/ze_kernel_collector.h new file mode 100644 index 00000000..48991e14 --- /dev/null +++ b/src/apex/L0/ze_kernel_collector.h @@ -0,0 +1,726 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.hpp" + +struct ZeKernelCommand { + std::string name; + size_t simd_width; + ze_event_pool_handle_t event_pool; + ze_event_handle_t event; + uint64_t timer_frequency; +}; + +struct ZeKernelInfo { + uint64_t total_time; + uint64_t min_time; + uint64_t max_time; + uint64_t call_count; + size_t simd_width; + + bool operator>(const ZeKernelInfo& r) const { + if (total_time != r.total_time) { + return total_time > r.total_time; + } + return call_count > r.call_count; + } + + bool operator!=(const 
ZeKernelInfo& r) const { + if (total_time == r.total_time) { + return call_count != r.call_count; + } + return true; + } +}; + +struct ZeKernelInterval { + std::string name; + uint64_t start; + uint64_t end; +}; + +struct ZeCommandListInfo { + std::vector kernel_command_list; + ze_context_handle_t context; + ze_device_handle_t device; + bool immediate; +}; + +using ZeKernelInfoMap = std::map; +using ZeKernelIntervalList = std::vector; +using ZeCommandListMap = std::map; + +typedef void (*OnKernelFinishCallback)( + void* data, const std::string& name, + uint64_t started, uint64_t ended); + +class ZeKernelCollector { + public: // Interface + + static ZeKernelCollector* Create( + ze_driver_handle_t driver, + OnKernelFinishCallback callback = nullptr, + void* callback_data = nullptr) { + PTI_ASSERT(utils::ze::GetVersion() != ZE_API_VERSION_1_0); + + ze_context_handle_t context = utils::ze::GetContext(driver); + PTI_ASSERT(context != nullptr); + + ZeKernelCollector* collector = new ZeKernelCollector(context, callback, callback_data); + PTI_ASSERT(collector != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + zel_tracer_desc_t tracer_desc = { + ZEL_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr, collector}; + zel_tracer_handle_t tracer = nullptr; + status = zelTracerCreate(&tracer_desc, &tracer); + if (status != ZE_RESULT_SUCCESS) { + std::cerr << "[WARNING] Unable to create Level Zero tracer" << std::endl; + delete collector; + return nullptr; + } + + collector->EnableTracing(tracer); + return collector; + } + + static void PrintKernelsTable(const ZeKernelInfoMap& kernel_info_map) { + std::set< std::pair, + utils::Comparator > sorted_list( + kernel_info_map.begin(), kernel_info_map.end()); + + uint64_t total_duration = 0; + size_t max_name_length = kKernelLength; + for (auto& value : sorted_list) { + total_duration += value.second.total_time; + if (value.first.size() > max_name_length) { + max_name_length = value.first.size(); + } + } + + if (total_duration == 0) { + 
return; + } + + std::cerr << std::setw(max_name_length) << "Kernel" << "," << + std::setw(kCallsLength) << "Calls" << "," << + std::setw(kSimdLength) << "SIMD" << "," << + std::setw(kTimeLength) << "Time (ns)" << "," << + std::setw(kPercentLength) << "Time (%)" << "," << + std::setw(kTimeLength) << "Average (ns)" << "," << + std::setw(kTimeLength) << "Min (ns)" << "," << + std::setw(kTimeLength) << "Max (ns)" << std::endl; + + for (auto& value : sorted_list) { + const std::string& function = value.first; + uint64_t call_count = value.second.call_count; + size_t simd_width = value.second.simd_width; + uint64_t duration = value.second.total_time; + uint64_t avg_duration = duration / call_count; + uint64_t min_duration = value.second.min_time; + uint64_t max_duration = value.second.max_time; + float percent_duration = 100.0f * duration / total_duration; + std::cerr << std::setw(max_name_length) << function << "," << + std::setw(kCallsLength) << call_count << "," << + std::setw(kSimdLength) << simd_width << "," << + std::setw(kTimeLength) << duration << "," << + std::setw(kPercentLength) << std::setprecision(2) << + std::fixed << percent_duration << "," << + std::setw(kTimeLength) << avg_duration << "," << + std::setw(kTimeLength) << min_duration << "," << + std::setw(kTimeLength) << max_duration << std::endl; + } + } + + ~ZeKernelCollector() { + if (tracer_ != nullptr) { + ze_result_t status = zelTracerDestroy(tracer_); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + } + + void DisableTracing() { + PTI_ASSERT(tracer_ != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + status = zelTracerSetEnabled(tracer_, false); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + const ZeKernelInfoMap& GetKernelInfoMap() const { + return kernel_info_map_; + } + + const ZeKernelIntervalList& GetKernelIntervalList() const { + return kernel_interval_list_; + } + + private: // Implementation + + ZeKernelCollector(ze_context_handle_t context, + OnKernelFinishCallback callback, + 
void* callback_data): + context_(context), callback_(callback), callback_data_(callback_data) { + + PTI_ASSERT(context_ != nullptr); + } + + void EnableTracing(zel_tracer_handle_t tracer) { + PTI_ASSERT(tracer != nullptr); + tracer_ = tracer; + + zet_core_callbacks_t prologue_callbacks{}; + zet_core_callbacks_t epilogue_callbacks{}; + + prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy; + + prologue_callbacks.Event.pfnHostResetCb = OnEnterEventHostReset; + + prologue_callbacks.EventPool.pfnCreateCb = OnEnterEventPoolCreate; + epilogue_callbacks.EventPool.pfnCreateCb = OnExitEventPoolCreate; + + prologue_callbacks.CommandList.pfnAppendLaunchKernelCb = + OnEnterCommandListAppendLaunchKernel; + epilogue_callbacks.CommandList.pfnAppendLaunchKernelCb = + OnExitCommandListAppendLaunchKernel; + + epilogue_callbacks.CommandList.pfnCreateCb = + OnExitCommandListCreate; + epilogue_callbacks.CommandList.pfnCreateImmediateCb = + OnExitCommandListCreateImmediate; + epilogue_callbacks.CommandList.pfnDestroyCb = + OnExitCommandListDestroy; + epilogue_callbacks.CommandList.pfnResetCb = + OnExitCommandListReset; + + epilogue_callbacks.CommandQueue.pfnExecuteCommandListsCb = + OnExitCommandQueueExecuteCommandLists; + epilogue_callbacks.CommandQueue.pfnSynchronizeCb = + OnExitCommandQueueSynchronize; + epilogue_callbacks.CommandQueue.pfnDestroyCb = + OnExitCommandQueueDestroy; + + ze_result_t status = ZE_RESULT_SUCCESS; + status = zelTracerSetPrologues(tracer_, &prologue_callbacks); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + status = zelTracerSetEpilogues(tracer_, &epilogue_callbacks); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + status = zelTracerSetEnabled(tracer_, true); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + void AddKernelCommand( + ze_command_list_handle_t command_list, + const ZeKernelCommand* command) { + PTI_ASSERT(command_list != nullptr); + PTI_ASSERT(command != nullptr); + + const std::lock_guard lock(lock_); + + 
PTI_ASSERT(command_list_map_.count(command_list) == 1); + ZeCommandListInfo& info = command_list_map_[command_list]; + info.kernel_command_list.push_back(command); + } + + void AddKernelCall(const ZeKernelCommand* call) { + PTI_ASSERT(call != nullptr); + const std::lock_guard lock(lock_); + kernel_call_list_.push_back(call); + } + + void AddKernelCalls( + ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + + const std::lock_guard lock(lock_); + + PTI_ASSERT(command_list_map_.count(command_list) == 1); + ZeCommandListInfo& info = command_list_map_[command_list]; + PTI_ASSERT(!info.immediate); + + for (const ZeKernelCommand* command : info.kernel_command_list) { + kernel_call_list_.push_back(command); + } + } + + void ProcessCall(ze_event_handle_t event) { + PTI_ASSERT(event != nullptr); + + const std::lock_guard lock(lock_); + + for (auto it = kernel_call_list_.begin(); + it != kernel_call_list_.end(); ++it) { + const ZeKernelCommand* call = *it; + PTI_ASSERT(call != nullptr); + if (call->event == event) { + ProcessCall(call); + kernel_call_list_.erase(it); + break; + } + } + } + + void ProcessCall(const ZeKernelCommand* call) { + PTI_ASSERT(call != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + status = zeEventQueryStatus(call->event); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + ze_kernel_timestamp_result_t timestamp{}; + status = zeEventQueryKernelTimestamp(call->event, ×tamp); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + uint64_t start = timestamp.global.kernelStart; + uint64_t end = timestamp.global.kernelEnd; + uint64_t freq = call->timer_frequency; + PTI_ASSERT(freq > 0); + + uint64_t time = 0, start_ns = 0, end_ns = 0; + + start_ns = start * + static_cast(NSEC_IN_SEC) / freq; + if (start < end) { + end_ns = end * + static_cast(NSEC_IN_SEC) / freq; + } else { // 32-bit timer overflow + PTI_ASSERT(start < (1ULL << 32)); + end_ns = ((1ULL << 32) + end) * + static_cast(NSEC_IN_SEC) / freq; + } + time = end_ns - 
start_ns; + + AddKernelInfo(call->name, time, call->simd_width); + AddKernelInterval(call->name, start_ns, end_ns); + + if(callback_ != nullptr){ + callback_(callback_data_, call->name, + start_ns, end_ns); + } + + + } + + void ProcessCalls() { + ze_result_t status = ZE_RESULT_SUCCESS; + + const std::lock_guard lock(lock_); + + auto it = kernel_call_list_.begin(); + while (it != kernel_call_list_.end()) { + const ZeKernelCommand* call = *it; + PTI_ASSERT(call != nullptr); + PTI_ASSERT(call->event != nullptr); + status = zeEventQueryStatus(call->event); + if (status == ZE_RESULT_NOT_READY) { + ++it; + } else if (status == ZE_RESULT_SUCCESS) { + ProcessCall(call); + it = kernel_call_list_.erase(it); + } else { + PTI_ASSERT(0); + } + } + } + + void AddKernelInfo( + std::string name, uint64_t time, size_t simd_width) { + PTI_ASSERT(!name.empty()); + if (kernel_info_map_.count(name) == 0) { + kernel_info_map_[name] = {time, time, time, 1, simd_width}; + } else { + ZeKernelInfo& kernel = kernel_info_map_[name]; + kernel.total_time += time; + if (time > kernel.max_time) { + kernel.max_time = time; + } + if (time < kernel.min_time) { + kernel.min_time = time; + } + kernel.call_count += 1; + kernel.simd_width = std::max(kernel.simd_width, simd_width); + } + } + + void AddKernelInterval(std::string name, uint64_t start, uint64_t end) { + PTI_ASSERT(!name.empty()); + PTI_ASSERT(start < end); + kernel_interval_list_.push_back({name, start, end}); + } + + void AddCommandList( + ze_command_list_handle_t command_list, + ze_context_handle_t context, + ze_device_handle_t device, + bool immediate) { + PTI_ASSERT(command_list != nullptr); + PTI_ASSERT(context != nullptr); + const std::lock_guard lock(lock_); + PTI_ASSERT(command_list_map_.count(command_list) == 0); + command_list_map_[command_list] = { + std::vector(), context, device, immediate}; + } + + void RemoveKernelCommands(ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + + 
PTI_ASSERT(command_list_map_.count(command_list) == 1); + ZeCommandListInfo& info = command_list_map_[command_list]; + for (const ZeKernelCommand* command : info.kernel_command_list) { + if (command->event_pool != nullptr) { + ze_result_t status = ZE_RESULT_SUCCESS; + status = zeEventDestroy(command->event); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + status = zeEventPoolDestroy(command->event_pool); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + for (const ZeKernelCommand* call : kernel_call_list_) { + PTI_ASSERT(call != command); + } + + delete command; + } + info.kernel_command_list.clear(); + } + + void RemoveCommandList(ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + const std::lock_guard lock(lock_); + RemoveKernelCommands(command_list); + command_list_map_.erase(command_list); + } + + void ResetCommandList(ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + const std::lock_guard lock(lock_); + RemoveKernelCommands(command_list); + } + + ze_context_handle_t GetCommandListContext( + ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + const std::lock_guard lock(lock_); + PTI_ASSERT(command_list_map_.count(command_list) == 1); + return command_list_map_[command_list].context; + } + + ze_device_handle_t GetCommandListDevice( + ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + const std::lock_guard lock(lock_); + PTI_ASSERT(command_list_map_.count(command_list) == 1); + return command_list_map_[command_list].device; + } + + bool IsCommandListImmediate(ze_command_list_handle_t command_list) { + PTI_ASSERT(command_list != nullptr); + const std::lock_guard lock(lock_); + PTI_ASSERT(command_list_map_.count(command_list) == 1); + ZeCommandListInfo& command_list_info = command_list_map_[command_list]; + return command_list_info.immediate; + } + + private: // Callbacks + + static void OnEnterEventPoolCreate(ze_event_pool_create_params_t 
*params, + ze_result_t result, + void *global_data, + void **instance_data) { + const ze_event_pool_desc_t* desc = *(params->pdesc); + if (desc == nullptr) { + return; + } + if (desc->flags & ZE_EVENT_POOL_FLAG_IPC) { + return; + } + + ze_event_pool_desc_t* profiling_desc = new ze_event_pool_desc_t; + PTI_ASSERT(profiling_desc != nullptr); + profiling_desc->stype = desc->stype; + // PTI_ASSERT(profiling_desc->stype == ZE_STRUCTURE_TYPE_EVENT_POOL_DESC); + profiling_desc->pNext = desc->pNext; + profiling_desc->flags = desc->flags; + profiling_desc->flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + profiling_desc->flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + profiling_desc->count = desc->count; + + *(params->pdesc) = profiling_desc; + *instance_data = profiling_desc; + } + + static void OnExitEventPoolCreate(ze_event_pool_create_params_t *params, + ze_result_t result, + void *global_data, + void **instance_data) { + ze_event_pool_desc_t* desc = + static_cast(*instance_data); + if (desc != nullptr) { + delete desc; + } + } + + static void OnEnterEventDestroy(ze_event_destroy_params_t *params, + ze_result_t result, + void *global_data, + void **instance_data) { + if (*(params->phEvent) != nullptr) { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCall(*(params->phEvent)); + } + } + + static void OnEnterEventHostReset(ze_event_host_reset_params_t *params, + ze_result_t result, + void *global_data, + void **instance_data) { + if (*(params->phEvent) != nullptr) { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCall(*(params->phEvent)); + } + } + + static void CreateEvent(ze_context_handle_t context, + ze_event_pool_handle_t& event_pool, + ze_event_handle_t& event) { + PTI_ASSERT(context != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + + ze_event_pool_desc_t event_pool_desc = { + ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 
+ ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP | ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + 1}; + status = zeEventPoolCreate( + context, &event_pool_desc, 0, nullptr, &event_pool); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + ze_event_desc_t event_desc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, + ZE_EVENT_SCOPE_FLAG_HOST, ZE_EVENT_SCOPE_FLAG_HOST}; + zeEventCreate(event_pool, &event_desc, &event); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + static void OnEnterKernelAppend( + std::string name, size_t simd_width, + ze_event_handle_t& signal_event, ze_command_list_handle_t command_list, + void* global_data, void** instance_data) { + PTI_ASSERT(!name.empty()); + + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + + if (command_list == nullptr) { + return; + } + + ZeKernelCommand* command = new ZeKernelCommand; + PTI_ASSERT(command != nullptr); + + command->name = name; + command->simd_width = simd_width; + + ze_device_handle_t device = collector->GetCommandListDevice(command_list); + PTI_ASSERT(device != nullptr); + command->timer_frequency = utils::ze::GetDeviceTimerFrequency(device); + PTI_ASSERT(command->timer_frequency > 0); + + if (signal_event == nullptr) { + ze_context_handle_t context = + collector->GetCommandListContext(command_list); + CreateEvent(context, command->event_pool, command->event); + signal_event = command->event; + } else { + command->event_pool = nullptr; + command->event = signal_event; + } + + *instance_data = static_cast(command); + } + + static void OnEnterCommandListAppendLaunchKernel( + ze_command_list_append_launch_kernel_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + OnEnterKernelAppend( + utils::ze::GetKernelName(*(params->phKernel)), + utils::ze::GetKernelMaxSubgroupSize(*(params->phKernel)), + *(params->phSignalEvent), *(params->phCommandList), + global_data, instance_data); + } + + static void OnExitKernelAppend( + ze_command_list_handle_t 
command_list, + void* global_data, void** instance_data, + ze_result_t result) { + PTI_ASSERT(command_list != nullptr); + + ZeKernelCommand* command = + static_cast(*instance_data); + if (command == nullptr) { + return; + } + + if (result != ZE_RESULT_SUCCESS) { + if (command->event_pool != nullptr) { + ze_result_t status = ZE_RESULT_SUCCESS; + status = zeEventDestroy(command->event); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + status = zeEventPoolDestroy(command->event_pool); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + delete command; + } else { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->AddKernelCommand(command_list, command); + if (collector->IsCommandListImmediate(command_list)) { + collector->AddKernelCall(command); + } + } + } + + static void OnExitCommandListAppendLaunchKernel( + ze_command_list_append_launch_kernel_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + PTI_ASSERT(*(params->phSignalEvent) != nullptr); + OnExitKernelAppend(*params->phCommandList, global_data, + instance_data, result); + } + + static void OnExitCommandListCreate( + ze_command_list_create_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + PTI_ASSERT(**params->pphCommandList != nullptr); + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->AddCommandList( + **(params->pphCommandList), + *(params->phContext), + *(params->phDevice), + false); + } + } + + static void OnExitCommandListCreateImmediate( + ze_command_list_create_immediate_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + PTI_ASSERT(**params->pphCommandList != nullptr); + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->AddCommandList( + 
**(params->pphCommandList), + *(params->phContext), + *(params->phDevice), + true); + } + } + + static void OnExitCommandListDestroy( + ze_command_list_destroy_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + PTI_ASSERT(*params->phCommandList != nullptr); + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCalls(); + collector->RemoveCommandList(*params->phCommandList); + } + } + + static void OnExitCommandListReset( + ze_command_list_reset_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + PTI_ASSERT(*params->phCommandList != nullptr); + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCalls(); + collector->ResetCommandList(*params->phCommandList); + } + } + + static void OnExitCommandQueueExecuteCommandLists( + ze_command_queue_execute_command_lists_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + + uint32_t command_list_count = *params->pnumCommandLists; + ze_command_list_handle_t* command_lists = *params->pphCommandLists; + for (uint32_t i = 0; i < command_list_count; ++i) { + if (!collector->IsCommandListImmediate(command_lists[i])) { + collector->AddKernelCalls(command_lists[i]); + } + } + } + } + + static void OnExitCommandQueueSynchronize( + ze_command_queue_synchronize_params_t* params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCalls(); + } + } + + static void OnExitCommandQueueDestroy( + ze_command_queue_destroy_params_t* 
params, + ze_result_t result, void* global_data, void** instance_data) { + if (result == ZE_RESULT_SUCCESS) { + ZeKernelCollector* collector = + reinterpret_cast(global_data); + PTI_ASSERT(collector != nullptr); + collector->ProcessCalls(); + } + } + + private: // Data + zel_tracer_handle_t tracer_ = nullptr; + ze_context_handle_t context_ = nullptr; + + OnKernelFinishCallback callback_ = nullptr; + void* callback_data_ = nullptr; + + std::mutex lock_; + ZeKernelInfoMap kernel_info_map_; + ZeKernelIntervalList kernel_interval_list_; + std::list kernel_call_list_; + ZeCommandListMap command_list_map_; + + static const uint32_t kKernelLength = 10; + static const uint32_t kCallsLength = 12; + static const uint32_t kSimdLength = 5; + static const uint32_t kTimeLength = 20; + static const uint32_t kPercentLength = 10; +}; + diff --git a/src/apex/L0/ze_utils.h b/src/apex/L0/ze_utils.h new file mode 100644 index 00000000..323b7fb5 --- /dev/null +++ b/src/apex/L0/ze_utils.h @@ -0,0 +1,392 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include "demangle.h" +#include "pti_assert.h" +#include "utils.h" + +namespace utils { + namespace ze { + + inline std::vector GetDriverList() { + ze_result_t status = ZE_RESULT_SUCCESS; + + uint32_t driver_count = 0; + status = zeDriverGet(&driver_count, nullptr); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (driver_count == 0) { + return std::vector(); + } + + std::vector driver_list(driver_count); + status = zeDriverGet(&driver_count, driver_list.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + return driver_list; + } + + inline std::vector GetDeviceList(ze_driver_handle_t driver) { + PTI_ASSERT(driver != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + + uint32_t 
device_count = 0; + status = zeDeviceGet(driver, &device_count, nullptr); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (device_count == 0) { + return std::vector(); + } + + std::vector device_list(device_count); + status = zeDeviceGet(driver, &device_count, device_list.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + return device_list; + } + + inline std::vector GetDeviceList() { + std::vector device_list; + for (auto driver : utils::ze::GetDriverList()) { + for (auto device : utils::ze::GetDeviceList(driver)) { + device_list.push_back(device); + } + } + return device_list; + } + + inline std::vector GetSubDeviceList( + ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + + uint32_t sub_device_count = 0; + status = zeDeviceGetSubDevices(device, &sub_device_count, nullptr); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (sub_device_count == 0) { + return std::vector(); + } + + std::vector sub_device_list(sub_device_count); + status = zeDeviceGetSubDevices( + device, &sub_device_count, sub_device_list.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + return sub_device_list; + } + + inline ze_driver_handle_t GetGpuDriver() { + std::vector driver_list; + + for (auto driver : GetDriverList()) { + for (auto device : GetDeviceList(driver)) { + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + if (props.type == ZE_DEVICE_TYPE_GPU) { + driver_list.push_back(driver); + } + } + } + + if (driver_list.empty()) { + return nullptr; + } + + std::string value = utils::GetEnv("PTI_DEVICE_ID"); + uint32_t device_id = value.empty() ? 
0 : std::stoul(value); + PTI_ASSERT(device_id >= 0 && device_id < driver_list.size()); + return driver_list[device_id]; + } + + inline ze_device_handle_t GetGpuDevice() { + std::vector device_list; + + for (auto driver : GetDriverList()) { + for (auto device : GetDeviceList(driver)) { + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + if (props.type == ZE_DEVICE_TYPE_GPU) { + device_list.push_back(device); + } + } + } + + if (device_list.empty()) { + return nullptr; + } + + std::string value = utils::GetEnv("PTI_DEVICE_ID"); + uint32_t device_id = value.empty() ? 0 : std::stoul(value); + PTI_ASSERT(device_id >= 0 && device_id < device_list.size()); + + std::vector sub_device_list = + GetSubDeviceList(device_list[device_id]); + if (sub_device_list.empty()) { + return device_list[device_id]; + } + + value = utils::GetEnv("PTI_SUB_DEVICE_ID"); + if (value.empty()) { + return device_list[device_id]; + } + + uint32_t sub_device_id = value.empty() ? 
0 : std::stoul(value); + PTI_ASSERT(sub_device_id >= 0 && sub_device_id < sub_device_list.size()); + return sub_device_list[sub_device_id]; + } + + inline ze_context_handle_t GetContext(ze_driver_handle_t driver) { + PTI_ASSERT(driver != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + ze_context_handle_t context = nullptr; + ze_context_desc_t context_desc = { + ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0}; + + status = zeContextCreate(driver, &context_desc, &context); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return context; + } + + inline std::string GetDeviceName(ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_result_t status = ZE_RESULT_SUCCESS; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return props.name; + } + + inline int GetMetricId(zet_metric_group_handle_t group, std::string name) { + PTI_ASSERT(group != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + uint32_t metric_count = 0; + status = zetMetricGet(group, &metric_count, nullptr); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (metric_count == 0) { + return -1; + } + + std::vector metric_list(metric_count, nullptr); + status = zetMetricGet(group, &metric_count, metric_list.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + int target = -1; + for (uint32_t i = 0; i < metric_count; ++i) { + zet_metric_properties_t metric_props{}; + status = zetMetricGetProperties(metric_list[i], &metric_props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (name == metric_props.name) { + target = i; + break; + } + } + + return target; + } + + inline zet_metric_group_handle_t FindMetricGroup( + ze_device_handle_t device, std::string name, + zet_metric_group_sampling_type_flag_t type) { + PTI_ASSERT(device != nullptr); + + ze_result_t status = ZE_RESULT_SUCCESS; + uint32_t group_count = 0; + status = zetMetricGroupGet(device, &group_count, nullptr); + 
PTI_ASSERT(status == ZE_RESULT_SUCCESS); + if (group_count == 0) { + return nullptr; + } + + std::vector group_list(group_count, nullptr); + status = zetMetricGroupGet(device, &group_count, group_list.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + zet_metric_group_handle_t target = nullptr; + for (uint32_t i = 0; i < group_count; ++i) { + zet_metric_group_properties_t group_props{}; + group_props.stype = ZET_STRUCTURE_TYPE_METRIC_GROUP_PROPERTIES; + status = zetMetricGroupGetProperties(group_list[i], &group_props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + if (name == group_props.name && (group_props.samplingType & type)) { + target = group_list[i]; + break; + } + } + + return target; + } + + inline std::string GetResultType(zet_value_type_t type) { + switch (type) { + case ZET_VALUE_TYPE_UINT32: + return "UINT32"; + case ZET_VALUE_TYPE_UINT64: + return "UINT64"; + case ZET_VALUE_TYPE_FLOAT32: + return "FLOAT32"; + case ZET_VALUE_TYPE_FLOAT64: + return "FLOAT64"; + case ZET_VALUE_TYPE_BOOL8: + return "BOOL8"; + default: + break; + } + return "UNKNOWN"; + } + + inline std::string GetMetricType(zet_metric_type_t type) { + switch (type) { + case ZET_METRIC_TYPE_DURATION: + return "DURATION"; + case ZET_METRIC_TYPE_EVENT: + return "EVENT"; + case ZET_METRIC_TYPE_EVENT_WITH_RANGE: + return "EVENT_WITH_RANGE"; + case ZET_METRIC_TYPE_THROUGHPUT: + return "THROUGHPUT"; + case ZET_METRIC_TYPE_TIMESTAMP: + return "TIMESTAMP"; + case ZET_METRIC_TYPE_FLAG: + return "FLAG"; + case ZET_METRIC_TYPE_RATIO: + return "RATIO"; + case ZET_METRIC_TYPE_RAW: + return "RAW"; + default: + break; + } + return "UNKNOWN"; + } + + inline size_t GetKernelMaxSubgroupSize(ze_kernel_handle_t kernel) { + PTI_ASSERT(kernel != nullptr); + ze_kernel_properties_t props{}; + ze_result_t status = zeKernelGetProperties(kernel, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return props.maxSubgroupSize; + } + + inline std::string GetKernelName( + ze_kernel_handle_t kernel, bool 
demangle = false) { + PTI_ASSERT(kernel != nullptr); + + size_t size = 0; + ze_result_t status = zeKernelGetName(kernel, &size, nullptr); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + PTI_ASSERT(size > 0); + + std::vector name(size); + status = zeKernelGetName(kernel, &size, name.data()); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + PTI_ASSERT(name[size - 1] == '\0'); + + if (demangle) { + return utils::Demangle(name.data()); + } + return std::string(name.begin(), name.end() - 1); + } + + inline void GetDeviceTimestamps( + ze_device_handle_t device, + uint64_t* host_timestamp, + uint64_t* device_timestamp) { + PTI_ASSERT(device != nullptr); + PTI_ASSERT(host_timestamp != nullptr); + PTI_ASSERT(device_timestamp != nullptr); + ze_result_t status = zeDeviceGetGlobalTimestamps( + device, host_timestamp, device_timestamp); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + // TODO: use zeMetricGetGlobalTimestamps + inline void GetMetricTimestamps( + ze_device_handle_t device, + uint64_t* host_timestamp, + uint64_t* metric_timestamp) { + PTI_ASSERT(device != nullptr); + PTI_ASSERT(host_timestamp != nullptr); + PTI_ASSERT(metric_timestamp != nullptr); + ze_result_t status = zeDeviceGetGlobalTimestamps( + device, host_timestamp, metric_timestamp); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + } + + inline uint64_t GetDeviceTimerFrequency(ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return props.timerResolution; + } + + inline uint64_t GetMetricTimerFrequency(ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return props.timerResolution; + } + + inline uint64_t 
GetDeviceTimestampMask(ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return (1ull << props.kernelTimestampValidBits) - 1ull; + } + + inline uint64_t GetMetricTimestampMask(ze_device_handle_t device) { +#ifdef PTI_OA_TIMESTAMP_VALID_BITS + return (1ull << PTI_OA_TIMESTAMP_VALID_BITS) - 1ull; +#else + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return (1ull << props.kernelTimestampValidBits) - 1ull; +#endif + } + + inline ze_api_version_t GetDriverVersion(ze_driver_handle_t driver) { + PTI_ASSERT(driver != nullptr); + + ze_api_version_t version = ZE_API_VERSION_FORCE_UINT32; + ze_result_t status = zeDriverGetApiVersion(driver, &version); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + return version; + } + + inline ze_api_version_t GetVersion() { + auto driver_list = GetDriverList(); + if (driver_list.empty()) { + return ZE_API_VERSION_FORCE_UINT32; + } + return GetDriverVersion(driver_list.front()); + } + + } // namespace ze +} // namespace utils + diff --git a/src/apex/apex_level0.cpp b/src/apex/apex_level0.cpp new file mode 100644 index 00000000..189e424d --- /dev/null +++ b/src/apex/apex_level0.cpp @@ -0,0 +1,386 @@ +//============================================================== +// Copyright © 2020 Intel Corporation, 2022 University of Oregon +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "apex_api.hpp" +#include "apex.hpp" +#include "trace_event_listener.hpp" + +using namespace std; +using namespace 
apex; + +static ZeApiCollector* api_collector = nullptr; +static ZeKernelCollector* kernel_collector = nullptr; +static chrono::steady_clock::time_point start_time; +static int gpu_task_id = 0; +static int host_api_task_id = 0; +static uint64_t first_clock_timestamp; +static uint64_t first_cpu_timestamp; +static uint64_t first_gpu_timestamp; +static uint64_t cpu_delta = 0L; +static uint64_t gpu_delta = 0L; +static uint64_t last_gpu_timestamp = 0L; +static uint64_t gpu_offset = 0L; + +// External Tool Interface //////////////////////////////////////////////////// + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void Usage() { + cout << + "Usage: ./ze_hot_kernels[.exe] " << + endl; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +int ParseArgs(int argc, char* argv[]) { + return 1; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void SetToolEnv() { + utils::SetEnv("ZET_ENABLE_API_TRACING_EXP","1"); +} + +// Internal Tool Functionality //////////////////////////////////////////////// + +static void PrintResults() { + chrono::steady_clock::time_point end = chrono::steady_clock::now(); + chrono::duration time = end - start_time; + + PTI_ASSERT(kernel_collector != nullptr); + const ZeKernelInfoMap& kernel_info_map = kernel_collector->GetKernelInfoMap(); + if (kernel_info_map.size() == 0) { + return; + } + + uint64_t total_duration = 0; + for (auto& value : kernel_info_map) { + total_duration += value.second.total_time; + } + + cerr << endl; + cerr << "=== Device Timing Results: ===" << endl; + cerr << endl; + cerr << "Total Execution Time (ns): " << time.count() << endl; + cerr << "Total Device Time (ns): " << total_duration << endl; + cerr << endl; + + if (total_duration > 0) { + ZeKernelCollector::PrintKernelsTable(kernel_info_map); + } + + cerr << endl; +} + +// Internal Tool Functionality //////////////////////////////////////////////// + +static void APIPrintResults() { + chrono::steady_clock::time_point 
end = chrono::steady_clock::now(); + chrono::duration time = end - start_time; + + PTI_ASSERT(api_collector != nullptr); + const ZeFunctionInfoMap& function_info_map = api_collector->GetFunctionInfoMap(); + if (function_info_map.size() == 0) { + return; + } + + uint64_t total_duration = 0; + for (auto& value : function_info_map) { + total_duration += value.second.total_time; + } + + cerr << endl; + cerr << "=== API Timing Results: ===" << endl; + cerr << endl; + cerr << "Total Execution Time (ns): " << time.count() << endl; + cerr << "Total API Time (ns): " << total_duration << endl; + cerr << endl; + + if (total_duration > 0) { + ZeApiCollector::PrintFunctionsTable(function_info_map); + } + + std::cerr << std::endl; +} + +uint64_t TAUTranslateGPUTimestamp(uint64_t gpu_ts) { + // gpu_ts is in nanoseconds. + uint64_t new_ts = gpu_ts + gpu_delta; + return new_ts; +} + +uint64_t TAUTranslateCPUTimestamp(uint64_t cpu_ts) { + // cpu_ts is in nanoseconds. + uint64_t new_ts = cpu_ts + cpu_delta; + return new_ts; +} + +void TAUOnAPIFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { + uint64_t taskid; + + taskid = *((uint64_t *) data); + uint64_t started_translated = TAUTranslateCPUTimestamp(started); + uint64_t ended_translated = TAUTranslateCPUTimestamp(ended); + DEBUG_PRINT("APEX: OnAPIFinishCallback: (raw) name: %s started: %lu ended: %lu task id=%lu\n", + name.c_str(), started, ended, taskid); + DEBUG_PRINT("APEX: OnAPIFinishCallback: (translated) name: %s started: %lu ended: %lu task id=%lu\n", + name.c_str(), started_translated, ended_translated, taskid); + // We now need to start a timer on a task at the started_translated time and end at ended_translated + + // create a task_wrapper, as a child of the current timer + auto tt = new_task(name, UINT64_MAX, nullptr); + // create an APEX profiler to store this data - we can't start + // then stop because we have timestamps already. 
+ auto prof = std::make_shared(tt); + prof->set_start(started_translated); + prof->set_end(ended_translated); + // important! Otherwise we might get the wrong end timestamp. + prof->stopped = true; + // Get the singleton APEX instance + static auto * instance = ::apex::apex::instance(); + // fake out the profiler_listener + instance->the_profiler_listener->push_profiler_public(prof); + // Handle tracing, if necessary + if (apex_options::use_trace_event()) { + trace_event_listener * tel = + (trace_event_listener*)instance->the_trace_event_listener; + tel->on_stop(prof); + } +#ifdef APEX_HAVE_OTF2 + if (apex_options::use_otf2()) { + otf2_listener * tol = + (otf2_listener*)instance->the_otf2_listener; + tol->on_start(prof); + tol->on_stop(prof); + } +#endif + // have the listeners handle the end of this task + instance->complete_task(tt); +} + +void store_profiler_data(const std::string &name, + uint64_t start, uint64_t end, level0_thread_node &node, + std::shared_ptr parent) { + in_apex prevent_deadlocks; + async_event_data as_data; + as_data.flow = false; + // create a task_wrapper, as a GPU child of the parent on the CPU side + auto tt = new_task(name, UINT64_MAX, parent); + // create an APEX profiler to store this data - we can't start + // then stop because we have timestamps already. + auto prof = std::make_shared(tt); + prof->set_start(start); + prof->set_end(end); + // important! Otherwise we might get the wrong end timestamp. 
+ prof->stopped = true; + // Get the singleton APEX instance + static auto* instance = ::apex::apex::instance(); + // fake out the profiler_listener + instance->the_profiler_listener->push_profiler_public(prof); + // Handle tracing, if necessary + if (apex_options::use_trace_event()) { + trace_event_listener * tel = + (trace_event_listener*)instance->the_trace_event_listener; + tel->on_async_event(node, prof, as_data); + } +#ifdef APEX_HAVE_OTF2 + if (apex_options::use_otf2()) { + otf2_listener * tol = + (otf2_listener*)instance->the_otf2_listener; + tol->on_async_event(node, prof); + } +#endif + // have the listeners handle the end of this task + instance->complete_task(tt); +} + + +void TAUOnKernelFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { + + int taskid; + taskid = *((int *) data); + uint64_t started_translated = TAUTranslateGPUTimestamp(started); + uint64_t ended_translated = TAUTranslateGPUTimestamp(ended); + DEBUG_PRINT("APEX: : (raw) name: %s started: %lu ended: %lu task id=%d\n", + name.c_str(), started, ended, taskid); + DEBUG_PRINT("APEX: : (raw) name: %s started: %lu ended: %lu task id=%d\n", + name.c_str(), started_translated, ended_translated, taskid); + + last_gpu_timestamp = ended; + int device_num = 0; + int parent_thread = 0; + std::string demangled = demangle(name); + demangled = regex_replace(demangled, regex("typeinfo name for "), "GPU: "); + level0_thread_node node(device_num, parent_thread, APEX_ASYNC_KERNEL); + store_profiler_data(demangled, started_translated, ended_translated, node, nullptr); + + return; +} + + +// Internal Tool Interface //////////////////////////////////////////////////// + +void EnableProfiling() { + if (getenv("ZE_ENABLE_TRACING_LAYER") == NULL) { + // tau_exec -level_zero was not called. 
Perhaps it is using -opencl + DEBUG_PRINT("APEX: Disabling Level Zero support as ZE_ENABLE_TRACING_LAYER was not set from tau_exec -l0\n"); + return; + } + ze_result_t status = ZE_RESULT_SUCCESS; + status = zeInit(ZE_INIT_FLAG_GPU_ONLY); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + ze_driver_handle_t driver = nullptr; + ze_device_handle_t device = nullptr; + driver = utils::ze::GetGpuDriver(); + device = utils::ze::GetGpuDevice(); + + if (device == nullptr || driver == nullptr) { + std::cout << "[WARNING] Unable to find target device" << std::endl; + return; + } + + uint64_t *kernel_taskid = new uint64_t; + //TAU_CREATE_TASK(*kernel_taskid); + void *pk = (void *) kernel_taskid; + gpu_task_id = *kernel_taskid; + uint64_t *api_taskid = new uint64_t; + //*host_taskid = RtsLayer::myThread(); + //TAU_CREATE_TASK(*api_taskid); + host_api_task_id = *api_taskid; + kernel_collector = ZeKernelCollector::Create(driver, + TAUOnKernelFinishCallback, pk); + /* + //uint64_t gpu_ts = utils::i915::GetGpuTimestamp() & 0x0FFFFFFFF; + uint64_t gpu_ts = utils::i915::GetGpuTimestamp() ; + std::cout <<"TAU: Earliest GPU timestamp "<DisableTracing(); + //if (TauEnv_get_verbose()) + PrintResults(); + delete kernel_collector; + } + if (api_collector != nullptr) { + api_collector->DisableTracing(); + //if (TauEnv_get_verbose()) + APIPrintResults(); + delete api_collector; + } + //uint64_t gpu_end_ts = utils::i915::GetGpuTimestamp() & 0x0FFFFFFFF; + /* + uint64_t gpu_end_ts = utils::i915::GetGpuTimestamp(); + std::cout <<"APEX: Latest GPU timestamp "< chrono_dt = chrono_end - start_time; + DEBUG_PRINT("APEX: Diff (chrono) =%ld \n", chrono_dt.count()); +} + + +// preload.cc +#if defined(__gnu_linux__) + +#include + +typedef void (*Exit)(int status) __attribute__ ((noreturn)); +typedef int (*Main)(int argc, char** argv, char** envp); +typedef int (*Fini)(void); +typedef int (*LibcStartMain)(Main main, int argc, char** argv, Main init, + Fini fini, Fini rtld_fini, void *stack_end); + +// 
Pointer to original application main() function +Main original_main = nullptr; + +extern "C" int HookedMain(int argc, char **argv, char **envp) { + EnableProfiling(); + int return_code = original_main(argc, argv, envp); + DisableProfiling(); + return return_code; +} + +extern "C" int __libc_start_main(Main main, + int argc, + char** argv, + Main init, + Fini fini, + Fini rtld_fini, + void* stack_end) { + original_main = main; + LibcStartMain original = + (LibcStartMain)dlsym(RTLD_NEXT, "__libc_start_main"); + return original(HookedMain, argc, argv, init, fini, rtld_fini, stack_end); +} + +extern "C" void exit(int status) { + Exit original = (Exit)dlsym(RTLD_NEXT, "exit"); + DisableProfiling(); + original(status); +} + +#else +#error not supported +#endif + diff --git a/src/apex/async_thread_node.hpp b/src/apex/async_thread_node.hpp index 7bed190e..fcbd13f7 100644 --- a/src/apex/async_thread_node.hpp +++ b/src/apex/async_thread_node.hpp @@ -14,8 +14,14 @@ namespace apex { + /* This is the base thread node. Duh. What it does is translate a combination + * of device, context, stream, queue, command queue, thread, etc. to a virtual + * thread for the purposes of tracing. Because asynchronous GPU activity can + * overlap on the device, we need to make sure we have sufficient uniqueness + * that we don't cause OTF2 to barf. 
*/ class base_thread_node { public: + /* The base node has device, context, stream - which should handle all cases */ uint32_t _device; uint32_t _context; uint32_t _stream; @@ -56,6 +62,7 @@ namespace apex { } }; + /* The ompt node has device, thread, using the thread value in the stream */ class ompt_thread_node : public base_thread_node { public: ompt_thread_node(uint32_t device, uint32_t thread, @@ -75,6 +82,7 @@ namespace apex { }; + /* The cuda node has device, context, stream */ class cuda_thread_node : public base_thread_node { public: cuda_thread_node(uint32_t device, uint32_t context, uint32_t stream, @@ -95,6 +103,7 @@ namespace apex { } }; + /* The hip node has device, command_queue, which is stored in the stream */ class hip_thread_node : public base_thread_node { public: hip_thread_node(uint32_t device, uint32_t command_queue, @@ -114,5 +123,25 @@ namespace apex { } }; + /* The level0 node has device, thread, using the thread value in the stream */ + class level0_thread_node : public base_thread_node { + public: + level0_thread_node(uint32_t device, uint32_t thread, + apex_async_activity_t activity) : + base_thread_node(device, 0, thread, activity) { } + virtual std::string name () { + std::stringstream ss; + ss << "GPU [" << _device << ":" << _stream << "]"; + std::string tmp{ss.str()}; + return tmp; + } + virtual uint32_t sortable_tid () { + uint32_t tid = ((_device+1) << 28); + tid = tid + _stream; + return tid; + } + }; + + } diff --git a/src/apex/utils.hpp b/src/apex/utils.hpp index 38460862..4a3cb733 100644 --- a/src/apex/utils.hpp +++ b/src/apex/utils.hpp @@ -23,6 +23,9 @@ #endif #include +#include "apex_types.h" +#include "apex_options.hpp" + #ifdef DEBUG #define DEBUG_PRINT(...) if (::apex::apex_options::use_verbose()) { \ fprintf( stderr, __VA_ARGS__ ); fflush(stderr); } @@ -30,8 +33,6 @@ fprintf( stderr, __VA_ARGS__ ); fflush(stderr); } #define DEBUG_PRINT(...) 
do{ } while ( false ) #endif -#include "apex_types.h" - namespace apex { bool starts_with(const std::string& input, const std::string& match); diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 6650bfba..8d993040 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -59,6 +59,7 @@ where APEX options are zero or more of: --apex:hip_driver enable HIP/ROCTracer KSA driver API callbacks (default: off) --apex:hip_details enable per-kernel statistics where available (default: off) --apex:monitor_gpu enable GPU monitoring services (CUDA NVML, ROCm SMI) + --apex:level0 enable OneAPI Level0 measurement (default: off) --apex:cpuinfo enable sampling of /proc/cpuinfo (Linux only) --apex:meminfo enable sampling of /proc/meminfo (Linux only) --apex:net enable sampling of /proc/net/dev (Linux only) @@ -99,6 +100,7 @@ hip=no hip_counters=no hip_driver=no hip_details=no +level0=no monitor_gpu=no untied=no cpuinfo=no @@ -236,6 +238,10 @@ while (( "$#" )); do export APEX_ENABLE_HIP=1 shift ;; + --apex:level0) + export APEX_ENABLE_LEVEL0=1 + shift + ;; --apex:hip_counters) hip_counters=yes export APEX_ENABLE_HIP=1 From 9a14556bbf9d76d40725c55319c07d54799e7e75 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 28 Jun 2022 14:06:48 -0700 Subject: [PATCH 02/13] Updating OneAPI support, still examining timestamps --- src/apex/apex_level0.cpp | 20 ++++++++++++++++++++ src/scripts/apex_exec | 1 + 2 files changed, 21 insertions(+) diff --git a/src/apex/apex_level0.cpp b/src/apex/apex_level0.cpp index 189e424d..dfc4033a 100644 --- a/src/apex/apex_level0.cpp +++ b/src/apex/apex_level0.cpp @@ -294,6 +294,26 @@ void EnableProfiling() { DEBUG_PRINT("APEX: Real CPU timestamp= %ld \n",first_clock_timestamp); DEBUG_PRINT("APEX: CPU delta= %ld \n",cpu_delta); DEBUG_PRINT("APEX: GPU delta= %ld \n",gpu_delta); + utils::ze::GetDeviceTimestamps(device, &first_cpu_timestamp, &first_gpu_timestamp); + first_clock_timestamp = profiler::now_ns(); + DEBUG_PRINT("APEX: Second CPU 
timestamp= %ld \n",first_cpu_timestamp); + DEBUG_PRINT("APEX: Second GPU timestamp= %ld \n",first_gpu_timestamp); + DEBUG_PRINT("APEX: Real CPU timestamp= %ld \n",first_clock_timestamp); + utils::ze::GetDeviceTimestamps(device, &first_cpu_timestamp, &first_gpu_timestamp); + first_clock_timestamp = profiler::now_ns(); + DEBUG_PRINT("APEX: Third CPU timestamp= %ld \n",first_cpu_timestamp); + DEBUG_PRINT("APEX: Third GPU timestamp= %ld \n",first_gpu_timestamp); + DEBUG_PRINT("APEX: Real CPU timestamp= %ld \n",first_clock_timestamp); + utils::ze::GetDeviceTimestamps(device, &first_cpu_timestamp, &first_gpu_timestamp); + first_clock_timestamp = profiler::now_ns(); + DEBUG_PRINT("APEX: Fourth CPU timestamp= %ld \n",first_cpu_timestamp); + DEBUG_PRINT("APEX: Fourth GPU timestamp= %ld \n",first_gpu_timestamp); + DEBUG_PRINT("APEX: Real CPU timestamp= %ld \n",first_clock_timestamp); + utils::ze::GetDeviceTimestamps(device, &first_cpu_timestamp, &first_gpu_timestamp); + first_clock_timestamp = profiler::now_ns(); + DEBUG_PRINT("APEX: Fifth CPU timestamp= %ld \n",first_cpu_timestamp); + DEBUG_PRINT("APEX: Fifth GPU timestamp= %ld \n",first_gpu_timestamp); + DEBUG_PRINT("APEX: Real CPU timestamp= %ld \n",first_clock_timestamp); // For API calls, we create a new task and trigger the start/stop based on its // timestamps. 
diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 8d993040..3063978f 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -240,6 +240,7 @@ while (( "$#" )); do ;; --apex:level0) export APEX_ENABLE_LEVEL0=1 + export ZE_ENABLE_TRACING_LAYER=1 shift ;; --apex:hip_counters) From db3bbe7001777e001c389b2aaa44fd3dcee4e5ef Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 24 Jan 2023 14:38:01 -0800 Subject: [PATCH 03/13] Debugging Level0 support and adding inclusive time for task lifetimes --- CMakeLists.txt | 3 ++ src/apex/apex.cpp | 16 ++++--- src/apex/apex_clock.hpp | 31 ++++++++++++ src/apex/apex_level0.cpp | 73 +++++++++++------------------ src/apex/apex_level0.hpp | 11 +++++ src/apex/apex_types.h | 1 + src/apex/dependency_tree.cpp | 21 ++++++--- src/apex/dependency_tree.hpp | 5 +- src/apex/proc_read_papi.cpp | 1 + src/apex/profile.hpp | 29 +++++++++--- src/apex/profile_reducer.cpp | 6 ++- src/apex/profiler.hpp | 62 ++++++++++++++---------- src/apex/profiler_listener.cpp | 24 +++++----- src/apex/task_wrapper.hpp | 10 ++-- src/apex/trace_event_listener.cpp | 2 +- src/openmp/ompt_target_matmult.c | 2 +- src/openmp/ompt_target_vector_add.c | 4 +- 17 files changed, 192 insertions(+), 109 deletions(-) create mode 100644 src/apex/apex_clock.hpp create mode 100644 src/apex/apex_level0.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a93cadf..da2eaa1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,10 +180,13 @@ if(APEX_WITH_LEVEL0) find_package(LEVEL0 REQUIRED) if (LEVEL0_FOUND) include_directories(${LEVEL0_INCLUDE_DIRS}) + add_definitions(-DAPEX_WITH_LEVEL0) set(LIBS ${LIBS} ${LEVEL0_LIBRARIES}) if (NOT BUILD_STATIC_EXECUTABLES) set (CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_RPATH} ${LEVEL0_LIBRARY_DIR}) endif() + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter -Rno-debug-disables-optimization") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Rno-debug-disables-optimization") endif() 
endif(APEX_WITH_LEVEL0) diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp index b72b2d89..2aba7af0 100644 --- a/src/apex/apex.cpp +++ b/src/apex/apex.cpp @@ -86,6 +86,10 @@ DEFINE_DESTRUCTOR(apex_finalize_static_void) #endif // HAS_CONSTRUCTORS #endif // APEX_HAVE_HPX +#ifdef APEX_WITH_LEVEL0 +#include "apex_level0.hpp" +#endif + #ifdef APEX_HAVE_TCMALLOC #include "tcmalloc_hooks.hpp" #endif @@ -547,6 +551,9 @@ uint64_t init(const char * thread_name, uint64_t comm_rank, #ifdef APEX_WITH_HIP init_hip_tracing(); #endif +#ifdef APEX_WITH_LEVEL0 + level0::EnableProfiling(); +#endif // Unset the LD_PRELOAD variable, because Active Harmony is going to // fork/execv a new session-core process, and we don't want APEX in @@ -689,8 +696,6 @@ profiler* start(const std::string &timer_name) return profiler::get_disabled_profiler(); } } - // save the start of the task, in case we need to make a flow event for tracing - tt_ptr->start_time = tt_ptr->prof->get_start_us(); // If we are allowing untied timers, clear the timer stack on this thread if (apex_options::untied_timers() == true) { new_profiler = thread_instance::instance().get_current_profiler(); @@ -756,8 +761,6 @@ profiler* start(const apex_function_address function_address) { return profiler::get_disabled_profiler(); } } - // save the start of the task, in case we need to make a flow event for tracing - tt_ptr->start_time = tt_ptr->prof->get_start_us(); // If we are allowing untied timers, clear the timer stack on this thread if (apex_options::untied_timers() == true) { new_profiler = thread_instance::instance().get_current_profiler(); @@ -826,8 +829,6 @@ void start(std::shared_ptr tt_ptr) { return; } } - // save the start of the task, in case we need to make a flow event for tracing - tt_ptr->start_time = tt_ptr->prof->get_start_us(); // If we are allowing untied timers, clear the timer stack on this thread if (apex_options::untied_timers() == true) { thread_instance::instance().clear_current_profiler(); @@ -1644,6 
+1645,9 @@ std::string dump(bool reset) { #endif #ifdef APEX_WITH_HIP flush_hip_trace(); +#endif +#ifdef APEX_WITH_LEVEL0 + level0::DisableProfiling(); #endif if (_notify_listeners) { dump_event_data data(instance->get_node_id(), diff --git a/src/apex/apex_clock.hpp b/src/apex/apex_clock.hpp new file mode 100644 index 00000000..ff964465 --- /dev/null +++ b/src/apex/apex_clock.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2014-2021 Kevin Huck + * Copyright (c) 2014-2021 University of Oregon + * + * Distributed under the Boost Software License, Version 1.0. (See accompanying + * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#pragma once + +#include +#define MYCLOCK std::chrono::system_clock + +namespace apex { + +class our_clock { +public: + // need this before the task_wrapper uses it. + static uint64_t time_point_to_nanoseconds(std::chrono::time_point tp) { + auto value = tp.time_since_epoch(); + uint64_t duration = + std::chrono::duration_cast(value).count(); + return duration; + } + static uint64_t now_ns() { + return time_point_to_nanoseconds(MYCLOCK::now()); + } +}; + +} // namespace + diff --git a/src/apex/apex_level0.cpp b/src/apex/apex_level0.cpp index dfc4033a..aa69646d 100644 --- a/src/apex/apex_level0.cpp +++ b/src/apex/apex_level0.cpp @@ -24,6 +24,9 @@ #include "apex_api.hpp" #include "apex.hpp" #include "trace_event_listener.hpp" +#if defined(APEX_WITH_PERFETTO) +#include "perfetto_listener.hpp" +#endif using namespace std; using namespace apex; @@ -71,6 +74,9 @@ void SetToolEnv() { // Internal Tool Functionality //////////////////////////////////////////////// +namespace apex { +namespace level0 { + static void PrintResults() { chrono::steady_clock::time_point end = chrono::steady_clock::now(); chrono::duration time = end - start_time; @@ -174,6 +180,14 @@ void TAUOnAPIFinishCallback(void *data, const std::string& name, uint64_t starte (trace_event_listener*)instance->the_trace_event_listener; tel->on_stop(prof); } +#if 
defined(APEX_WITH_PERFETTO) + if (apex_options::use_perfetto()) { + perfetto_listener * tel = + (perfetto_listener*)instance->the_perfetto_listener; + tel->on_start(tt); + tel->on_stop(prof); + } +#endif #ifdef APEX_HAVE_OTF2 if (apex_options::use_otf2()) { otf2_listener * tol = @@ -211,6 +225,13 @@ void store_profiler_data(const std::string &name, (trace_event_listener*)instance->the_trace_event_listener; tel->on_async_event(node, prof, as_data); } +#if defined(APEX_WITH_PERFETTO) + if (apex_options::use_perfetto()) { + perfetto_listener * tel = + (perfetto_listener*)instance->the_perfetto_listener; + tel->on_async_event(node, prof, as_data); + } +#endif #ifdef APEX_HAVE_OTF2 if (apex_options::use_otf2()) { otf2_listener * tol = @@ -325,16 +346,19 @@ void EnableProfiling() { } void DisableProfiling() { + static bool once{false}; + if (once) return; + once = true; if (kernel_collector != nullptr) { kernel_collector->DisableTracing(); //if (TauEnv_get_verbose()) - PrintResults(); + //PrintResults(); delete kernel_collector; } if (api_collector != nullptr) { api_collector->DisableTracing(); //if (TauEnv_get_verbose()) - APIPrintResults(); + //APIPrintResults(); delete api_collector; } //uint64_t gpu_end_ts = utils::i915::GetGpuTimestamp() & 0x0FFFFFFFF; @@ -359,48 +383,7 @@ void DisableProfiling() { DEBUG_PRINT("APEX: Diff (chrono) =%ld \n", chrono_dt.count()); } +} // namespace level0 +} // namespace apex -// preload.cc -#if defined(__gnu_linux__) - -#include - -typedef void (*Exit)(int status) __attribute__ ((noreturn)); -typedef int (*Main)(int argc, char** argv, char** envp); -typedef int (*Fini)(void); -typedef int (*LibcStartMain)(Main main, int argc, char** argv, Main init, - Fini fini, Fini rtld_fini, void *stack_end); - -// Pointer to original application main() function -Main original_main = nullptr; - -extern "C" int HookedMain(int argc, char **argv, char **envp) { - EnableProfiling(); - int return_code = original_main(argc, argv, envp); - 
DisableProfiling(); - return return_code; -} - -extern "C" int __libc_start_main(Main main, - int argc, - char** argv, - Main init, - Fini fini, - Fini rtld_fini, - void* stack_end) { - original_main = main; - LibcStartMain original = - (LibcStartMain)dlsym(RTLD_NEXT, "__libc_start_main"); - return original(HookedMain, argc, argv, init, fini, rtld_fini, stack_end); -} - -extern "C" void exit(int status) { - Exit original = (Exit)dlsym(RTLD_NEXT, "exit"); - DisableProfiling(); - original(status); -} - -#else -#error not supported -#endif diff --git a/src/apex/apex_level0.hpp b/src/apex/apex_level0.hpp new file mode 100644 index 00000000..a0c14af7 --- /dev/null +++ b/src/apex/apex_level0.hpp @@ -0,0 +1,11 @@ + +#pragma once + +namespace apex { +namespace level0 { + +void EnableProfiling(); +void DisableProfiling(); + +} +} \ No newline at end of file diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index af8f536a..07a07008 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -193,6 +193,7 @@ typedef struct _profile of samples collected for a counter */ double stops; /*!< Number of times a timer was yielded */ double accumulated; /*!< Accumulated values for all calls/samples */ + double inclusive_accumulated; /*!< Accumulated values task lifetimes */ double sum_squares; /*!< Running sum of squares calculation for all calls/samples */ double minimum; /*!< Minimum value seen by the timer or counter */ diff --git a/src/apex/dependency_tree.cpp b/src/apex/dependency_tree.cpp index 77e407bb..5b68a6d2 100644 --- a/src/apex/dependency_tree.cpp +++ b/src/apex/dependency_tree.cpp @@ -100,7 +100,9 @@ void Node::writeNode(std::ofstream& outfile, double total) { std::setfill('0') << std::setw(2) << std::hex << c->convert(c->green) << std::setfill('0') << std::setw(2) << std::hex << c->convert(c->blue) << "\"; depth=" << std::dec << depth << - "; time=" << std::fixed << acc << "; label=\"" << data->get_tree_name() << + "; time=" << std::fixed << acc << + "; 
inclusive=" << std::fixed << inclusive << + "; label=\"" << data->get_tree_name() << "\\lcalls: " << ncalls << "\\ltime: " << std::defaultfloat << acc << "\\l\" ];" << std::endl; @@ -140,7 +142,9 @@ double Node::writeNodeASCII(std::ofstream& outfile, double total, size_t indent) double stddev = sqrt(variance); outfile << " {min=" << std::fixed << std::setprecision(precision) << min << ", max=" << max << ", mean=" << mean << ", var=" << variance - << ", std dev=" << stddev << ", threads=" << thread_ids.size() << "} "; + << ", std dev=" << stddev + << ", inclusive=" << inclusive + << ", threads=" << thread_ids.size() << "} "; // Write out the name outfile << data->get_tree_name() << " "; // end the line @@ -204,7 +208,8 @@ double Node::writeNodeJSON(std::ofstream& outfile, double total, size_t indent) double ncalls = (calls == 0) ? 1 : calls; outfile << "\"metrics\": {\"time\": " << excl << ", \"total time (inc)\": " << acc - << ", \"time (inc)\": " << (acc / (double)(thread_ids.size())) + << ", \"time (inc cpu)\": " << (acc / (double)(thread_ids.size())) + << ", \"time (inc wall)\": " << inclusive << ", \"num threads\": " << thread_ids.size() << ", \"min (inc)\": " << min << ", \"max (inc)\": " << max @@ -288,7 +293,8 @@ void Node::writeTAUCallpath(std::ofstream& outfile, std::string prefix) { // write out exclusive outfile << std::fixed << std::setprecision(3) << remainder << " "; // write out inclusive - outfile << std::fixed << std::setprecision(3) << acc << " "; + //outfile << std::fixed << std::setprecision(3) << acc << " "; + outfile << std::fixed << std::setprecision(3) << inclusive << " "; // write out profilecalls and group outfile << "0 GROUP=\"" << data->get_group() << " | TAU_CALLPATH\" "; // end the line @@ -309,10 +315,13 @@ void Node::writeTAUCallpath(std::ofstream& outfile, std::string prefix) { return; } -void Node::addAccumulated(double value, bool is_resume, uint64_t thread_id) { +void Node::addAccumulated(double value, double incl, bool 
is_resume, uint64_t thread_id) { static std::mutex m; m.lock(); - if (!is_resume) { calls+=1; } + if (!is_resume) { + calls+=1; + inclusive = inclusive + incl; + } accumulated = accumulated + value; if (min == 0.0 || value < min) { min = value; } if (value > max) { max = value; } diff --git a/src/apex/dependency_tree.hpp b/src/apex/dependency_tree.hpp index eb459eda..f41db546 100644 --- a/src/apex/dependency_tree.hpp +++ b/src/apex/dependency_tree.hpp @@ -26,6 +26,7 @@ class Node { size_t count; double calls; double accumulated; + double inclusive; double min; double max; double sumsqr; @@ -37,7 +38,7 @@ class Node { public: Node(task_identifier* id, Node* p) : data(id), parent(p), count(1), calls(0), accumulated(0), - min(0), max(0), sumsqr(0), + inclusive(0), min(0), max(0), sumsqr(0), index(nodeCount.fetch_add(1, std::memory_order_relaxed)) { } ~Node() { @@ -54,7 +55,7 @@ class Node { size_t getCount() { return count; } size_t getCalls() { return calls; } double getAccumulated() { return accumulated; } - void addAccumulated(double value, bool is_resume, uint64_t thread_id); + void addAccumulated(double value, double incl, bool is_resume, uint64_t thread_id); size_t getIndex() { return index; }; std::string getName() { return data->get_name(); }; void writeNode(std::ofstream& outfile, double total); diff --git a/src/apex/proc_read_papi.cpp b/src/apex/proc_read_papi.cpp index 4c6fd915..7dc2b4a5 100644 --- a/src/apex/proc_read_papi.cpp +++ b/src/apex/proc_read_papi.cpp @@ -8,6 +8,7 @@ #include "papi.h" #define MAX_EVENTS_PER_EVENTSET 1024 +#include namespace apex { diff --git a/src/apex/profile.hpp b/src/apex/profile.hpp index e16209ee..53cb0251 100644 --- a/src/apex/profile.hpp +++ b/src/apex/profile.hpp @@ -34,7 +34,7 @@ class profile { std::mutex _mtx; std::set thread_ids; public: - profile(double initial, int num_metrics, double * papi_metrics, bool + profile(double initial, double inclusive, int num_metrics, double * papi_metrics, bool yielded = false, 
apex_profile_type type = APEX_TIMER) { memset(&(this->_profile), 0, sizeof(apex_profile)); _profile.type = type; @@ -45,6 +45,7 @@ class profile { } _profile.stops = 1.0; _profile.accumulated = initial; + _profile.inclusive_accumulated = inclusive; #if APEX_HAVE_PAPI for (int i = 0 ; i < num_metrics ; i++) { _profile.papi_metrics[i] = papi_metrics[i]; @@ -66,7 +67,7 @@ class profile { _profile.num_threads = 1; _profile.throttled = false; }; - profile(double initial, int num_metrics, double * papi_metrics, bool + profile(double initial, double inclusive, int num_metrics, double * papi_metrics, bool yielded, double allocations, double frees, double bytes_allocated, double bytes_freed) { _profile.type = APEX_TIMER; @@ -77,6 +78,7 @@ class profile { } _profile.stops = 1.0; _profile.accumulated = initial; + _profile.inclusive_accumulated = inclusive; #if APEX_HAVE_PAPI for (int i = 0 ; i < num_metrics ; i++) { _profile.papi_metrics[i] = papi_metrics[i]; @@ -103,10 +105,11 @@ class profile { profile(apex_profile * values) { memcpy(&_profile, values, sizeof(apex_profile)); } - void increment(double increase, int num_metrics, double * papi_metrics, + void increment(double increase, double inclusive, int num_metrics, double * papi_metrics, bool yielded, uint64_t thread_id) { _mtx.lock(); _profile.accumulated += increase; + _profile.inclusive_accumulated += inclusive; _profile.stops = _profile.stops + 1.0; #if APEX_HAVE_PAPI for (int i = 0 ; i < num_metrics ; i++) { @@ -129,10 +132,10 @@ class profile { _profile.num_threads = thread_ids.size(); _mtx.unlock(); } - void increment(double increase, int num_metrics, double * papi_metrics, + void increment(double increase, double inclusive, int num_metrics, double * papi_metrics, double allocations, double frees, double bytes_allocated, double bytes_freed, bool yielded, uint64_t thread_id) { - increment(increase, num_metrics, papi_metrics, yielded, thread_id); + increment(increase, inclusive, num_metrics, papi_metrics, yielded, 
thread_id); _mtx.lock(); _profile.allocations += allocations; _profile.frees += frees; @@ -173,6 +176,9 @@ class profile { double get_accumulated() { return _profile.accumulated; } + double get_inclusive_accumulated() { + return std::max(_profile.accumulated,_profile.inclusive_accumulated); + } double get_accumulated_mean_threads() { return (_profile.accumulated / (double)(_profile.num_threads)); } @@ -182,9 +188,20 @@ class profile { double get_accumulated_seconds() { return (get_accumulated() * 1.0e-9); } + double get_inclusive_accumulated_useconds() { + return (get_inclusive_accumulated() * 1.0e-3); + } + double get_inclusive_accumulated_seconds() { + return (get_inclusive_accumulated() * 1.0e-9); + } double * get_papi_metrics() { return (_profile.papi_metrics); } double get_minimum() { - return (_profile.minimum); + if (_profile.times_reset > 0) { + if (_profile.minimum == std::numeric_limits::max()) { + return 0.0; + } + } + return (std::max(_profile.minimum, 0.0)); } double get_maximum() { return (_profile.maximum); diff --git a/src/apex/profile_reducer.cpp b/src/apex/profile_reducer.cpp index 92276915..22b10927 100644 --- a/src/apex/profile_reducer.cpp +++ b/src/apex/profile_reducer.cpp @@ -15,10 +15,10 @@ #include #include -/* 10 values per timer/counter by default +/* 11 values per timer/counter by default * 4 values related to memory allocation tracking * 8 values (up to) when PAPI enabled */ -constexpr size_t num_fields{22}; +constexpr size_t num_fields{23}; #if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) #include "mpi.h" @@ -163,6 +163,7 @@ std::map reduce_profiles() { dptr[i++] = p->calls == 0.0 ? 1 : p->calls; dptr[i++] = p->stops == 0.0 ? 
1 : p->stops; dptr[i++] = p->accumulated; + dptr[i++] = p->inclusive_accumulated; dptr[i++] = p->sum_squares; dptr[i++] = p->minimum; dptr[i++] = p->maximum; @@ -224,6 +225,7 @@ std::map reduce_profiles() { p->calls += dptr[index++]; p->stops += dptr[index++]; p->accumulated += dptr[index++]; + p->inclusive_accumulated += dptr[index++]; p->sum_squares += dptr[index++]; p->minimum = dptr[index] < p->minimum ? dptr[index] : p->minimum; index++; diff --git a/src/apex/profiler.hpp b/src/apex/profiler.hpp index 895f60d4..d4df0e1b 100644 --- a/src/apex/profiler.hpp +++ b/src/apex/profiler.hpp @@ -16,11 +16,11 @@ class profiler; #include #include #include +#include #include "apex_options.hpp" #include "apex_types.h" -// #include "apex_assert.h" -#include -#include +#include "apex_assert.h" +#include "apex_clock.hpp" #include "task_wrapper.hpp" namespace apex { @@ -37,8 +37,6 @@ class disabled_profiler_exception : public std::exception { } }; -#define MYCLOCK std::chrono::system_clock - class profiler { private: task_identifier * task_id; // for counters, timers @@ -75,7 +73,7 @@ class profiler { reset_type reset = reset_type::NONE) : task_id(task->get_task_id()), tt_ptr(task), - start_ns(now_ns()), + start_ns(our_clock::now_ns()), #if APEX_HAVE_PAPI papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, @@ -93,7 +91,7 @@ class profiler { reset_type reset = reset_type::NONE) : task_id(id), tt_ptr(nullptr), - start_ns(now_ns()), + start_ns(our_clock::now_ns()), #if APEX_HAVE_PAPI papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, @@ -109,7 +107,7 @@ class profiler { profiler(task_identifier * id, double value_) : task_id(id), tt_ptr(nullptr), - start_ns(now_ns()), + start_ns(our_clock::now_ns()), #if APEX_HAVE_PAPI papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, @@ -157,16 +155,16 @@ class profiler { } void stop(bool is_resume) { this->is_resume = is_resume; - end_ns = now_ns(); + end_ns = our_clock::now_ns(); 
stopped = true; }; void stop() { - end_ns = now_ns(); + end_ns = our_clock::now_ns(); stopped = true; }; void restart() { this->is_resume = true; - start_ns = now_ns(); + start_ns = our_clock::now_ns(); }; uint64_t get_start_ns() { return start_ns; @@ -187,11 +185,11 @@ class profiler { return end_ns*1.0e-6; } static double now_us( void ) { - double stamp = (double)now_ns(); + double stamp = (double)our_clock::now_ns(); return stamp*1.0e-3; } static double now_ms( void ) { - double stamp = (double)now_ns(); + double stamp = (double)our_clock::now_ns(); return stamp*1.0e-6; } double elapsed() { @@ -199,11 +197,33 @@ class profiler { return value; } else { if (!stopped) { - end_ns = now_ns(); + end_ns = our_clock::now_ns(); } return ((double)(end_ns-start_ns)); } } + double inclusive() { + if(is_counter) { return 0.0; } + /* The task isn't done yet. */ + if (is_resume) { + return (0.0); + } else { + /* Is this an asynchronous event? if so, the task lifetime + * will be after the profiler lifetime. So check for that. */ + double d_incl_start = (double)( + start_ns < tt_ptr->get_start_ns() ? + start_ns : tt_ptr->get_start_ns()); + double d_end = (double)(end_ns); + if (!stopped) { + d_end = (double)our_clock::now_ns(); + } + double incl = d_end - d_incl_start; + return (incl); + } + } + double inclusive_seconds() { + return inclusive() * 1.0e-9; + } double elapsed_us() { return elapsed() * 1.0e-3; } @@ -217,14 +237,8 @@ class profiler { return elapsed() - children_value; } - static uint64_t time_point_to_nanoseconds(std::chrono::time_point tp) { - auto value = tp.time_since_epoch(); - uint64_t duration = - std::chrono::duration_cast(value).count(); - return duration; - } static uint64_t now_ns() { - return time_point_to_nanoseconds(MYCLOCK::now()); + return our_clock::now_ns(); } static profiler* get_disabled_profiler(void) { @@ -239,16 +253,16 @@ class profiler { * We want a timestamp for the start of the trace. * We will also need one for the end of the trace. 
*/ static uint64_t get_global_start(void) { - static uint64_t global_now = now_ns(); + static uint64_t global_now = our_clock::now_ns(); return global_now; } /* this is for getting the endpoint of the trace. */ static uint64_t get_global_end(void) { - return now_ns(); + return our_clock::now_ns(); } double normalized_timestamp(void) { if(is_counter) { - return now_ns() - get_global_start(); + return our_clock::now_ns() - get_global_start(); } else { return start_ns - get_global_start(); } diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index ba551991..8eab1ea0 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -187,7 +187,7 @@ std::unordered_set free_profiles; fmin(hardware_concurrency(), num_worker_threads); double elapsed = total_main - non_idle_time; elapsed = elapsed > 0.0 ? elapsed : 0.0; - profile * theprofile = new profile(elapsed, 0, nullptr, false); + profile * theprofile = new profile(elapsed, 0, 0, nullptr, false); { std::unique_lock l(free_profile_set_mutex); free_profiles.insert(theprofile); @@ -206,7 +206,7 @@ std::unordered_set free_profiles; fmin(hardware_concurrency(), num_worker_threads); double elapsed = total_main - non_idle_time; double rate = elapsed > 0.0 ? 
((elapsed/total_main)) : 0.0; - profile * theprofile = new profile(rate, 0, nullptr, false); + profile * theprofile = new profile(rate, 0, 0, nullptr, false); { std::unique_lock l(free_profile_set_mutex); free_profiles.insert(theprofile); @@ -231,7 +231,7 @@ std::unordered_set free_profiles; } else if (id.name == string(APEX_IDLE_TIME)) { return get_idle_time(); } else if (id.name == string(APEX_NON_IDLE_TIME)) { - profile * theprofile = new profile(get_non_idle_time(), 0, nullptr, false); + profile * theprofile = new profile(get_non_idle_time(), 0, 0, nullptr, false); { std::unique_lock l(free_profile_set_mutex); free_profiles.insert(theprofile); @@ -300,11 +300,11 @@ std::unordered_set free_profiles; theprofile->reset(); } else { if (apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) { - theprofile->increment(p.elapsed(), tmp_num_counters, + theprofile->increment(p.elapsed(), p.inclusive(), tmp_num_counters, values, p.allocations, p.frees, p.bytes_allocated, p.bytes_freed, p.is_resume, p.thread_id); } else { - theprofile->increment(p.elapsed(), tmp_num_counters, + theprofile->increment(p.elapsed(), p.inclusive(), tmp_num_counters, values, p.is_resume, p.thread_id); } } @@ -348,14 +348,14 @@ std::unordered_set free_profiles; if ((apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) && !p.is_counter) { theprofile = new profile(p.is_reset == - reset_type::CURRENT ? 0.0 : p.elapsed(), + reset_type::CURRENT ? 0.0 : p.elapsed(), p.inclusive(), tmp_num_counters, values, p.is_resume, p.allocations, p.frees, p.bytes_allocated, p.bytes_freed); task_map[*(p.get_task_id())] = theprofile; } else { theprofile = new profile(p.is_reset == - reset_type::CURRENT ? 0.0 : p.elapsed(), + reset_type::CURRENT ? 0.0 : p.elapsed(), p.inclusive(), tmp_num_counters, values, p.is_resume, p.is_counter ? 
APEX_COUNTER : APEX_TIMER); task_map[*(p.get_task_id())] = theprofile; @@ -422,7 +422,7 @@ std::unordered_set free_profiles; } } if (apex_options::use_tasktree_output() && !p.is_counter && p.tt_ptr != nullptr) { - p.tt_ptr->tree_node->addAccumulated(p.elapsed_seconds(), p.is_resume, p.thread_id); + p.tt_ptr->tree_node->addAccumulated(p.elapsed_seconds(), p.inclusive_seconds(), p.is_resume, p.thread_id); } return 1; } @@ -548,6 +548,7 @@ std::unordered_set free_profiles; csv_output << std::llround(p->get_maximum()) << ","; csv_output << std::llround(p->get_stddev()) << ","; csv_output << std::llround(p->get_accumulated_useconds()) << ","; + csv_output << std::llround(p->get_inclusive_accumulated_useconds()) << ","; csv_output << std::llround(p->get_num_threads()) << ","; csv_output << std::llround(p->get_accumulated_useconds()/p->get_num_threads()); //screen_output << " --n/a-- " ; @@ -668,7 +669,7 @@ std::unordered_set free_profiles; csv_output << std::llround(p->get_maximum()) << ","; csv_output << std::llround(p->get_stddev()) << ","; // add all the extra columns for timer data - csv_output << "0,0,0"; // accumulated seconds, threads, per thread - meaningless + csv_output << "0,0,0,0"; // accumulated seconds, inclusive, threads, per thread - meaningless #if APEX_HAVE_PAPI for (int i = 0 ; i < num_papi_counters ; i++) { csv_output << ",0"; @@ -777,7 +778,7 @@ std::unordered_set free_profiles; } } csv_output << "\"name\",\"type\",\"num samples/calls\",\"minimum\",\"mean\"," - << "\"maximum\",\"stddev\",\"total microseconds\",\"num threads\",\"total per thread\""; + << "\"maximum\",\"stddev\",\"total microseconds\",\"inclusive microseconds\",\"num threads\",\"total per thread\""; #if APEX_HAVE_PAPI for (int i = 0 ; i < num_papi_counters ; i++) { csv_output << ",\"" << metric_names[i] << "\""; @@ -1116,6 +1117,7 @@ std::unordered_set free_profiles; "\"; label=\"" << task_id.get_tree_name() << "\\lcalls: " << p->get_calls() << "\\ltime: " << accumulated << + 
"s\\linclusive: " << p->get_inclusive_accumulated_seconds() << "s\\l" << divided_label << accumulated/divisor << "s\\l\" ];" << std::endl; delete(c); @@ -1137,7 +1139,7 @@ std::unordered_set free_profiles; void format_line(ofstream &myfile, profile * p, task_identifier& task_id) { myfile << p->get_calls() << " "; myfile << 0 << " "; - myfile << ((p->get_accumulated_useconds())) << " "; + myfile << ((p->get_inclusive_accumulated_useconds())) << " "; myfile << ((p->get_accumulated_useconds())) << " "; myfile << 0 << " "; myfile << get_TAU_group(task_id); diff --git a/src/apex/task_wrapper.hpp b/src/apex/task_wrapper.hpp index 703a0ffe..78df1eca 100644 --- a/src/apex/task_wrapper.hpp +++ b/src/apex/task_wrapper.hpp @@ -23,6 +23,7 @@ struct task_wrapper; #include #include #include "dependency_tree.hpp" +#include "apex_clock.hpp" namespace apex { @@ -74,9 +75,9 @@ struct task_wrapper { */ long unsigned int thread_id; /** - \brief Time (in microseconds) when this task was started, if started + \brief Time (in nanoseconds) when this task was created */ - double start_time; + uint64_t start_ns; /** \brief Whether this event requires separate start/end events in gtrace */ @@ -93,7 +94,7 @@ struct task_wrapper { tree_node(nullptr), alias(nullptr), thread_id(0UL), - start_time(0), + start_ns(our_clock::now_ns()), explicit_trace_start(false) { } /** @@ -133,6 +134,9 @@ struct task_wrapper { // make/find a node for ourselves tree_node = parent->tree_node->replaceChild(task_id, alias); } + uint64_t get_start_ns() { + return start_ns; + } }; // struct task_wrapper } // namespace apex diff --git a/src/apex/trace_event_listener.cpp b/src/apex/trace_event_listener.cpp index 473b320e..b453632f 100644 --- a/src/apex/trace_event_listener.cpp +++ b/src/apex/trace_event_listener.cpp @@ -187,7 +187,7 @@ inline void trace_event_listener::_common_stop(std::shared_ptr &p) { if (p->tt_ptr->parent != nullptr && p->tt_ptr->parent->thread_id != tid) { //std::cout << "FLOWING!" 
<< std::endl; uint64_t flow_id = get_flow_id(); - write_flow_event(ss, p->tt_ptr->parent->start_time, 's', "ControlFlow", flow_id, + write_flow_event(ss, p->tt_ptr->parent->get_start_ns(), 's', "ControlFlow", flow_id, saved_node_id, p->tt_ptr->parent->thread_id, p->tt_ptr->parent->task_id->get_name()); write_flow_event(ss, p->get_start_us(), 'f', "ControlFlow", flow_id, saved_node_id, tid, p->tt_ptr->parent->task_id->get_name()); diff --git a/src/openmp/ompt_target_matmult.c b/src/openmp/ompt_target_matmult.c index cb0e49e9..871978d8 100644 --- a/src/openmp/ompt_target_matmult.c +++ b/src/openmp/ompt_target_matmult.c @@ -13,7 +13,7 @@ #include #ifndef MATRIX_SIZE -#define MATRIX_SIZE 1024 +#define MATRIX_SIZE 512 #endif #define MAX_ITERATIONS 3 diff --git a/src/openmp/ompt_target_vector_add.c b/src/openmp/ompt_target_vector_add.c index 0835de9e..d5723a62 100644 --- a/src/openmp/ompt_target_vector_add.c +++ b/src/openmp/ompt_target_vector_add.c @@ -4,8 +4,8 @@ #include #include -#define ARRAY_SIZE 1024*1024*512 -#define ITERATIONS 10 +#define ARRAY_SIZE 512*512*512 +#define ITERATIONS 3 int run_cpu( int argc, char** argv ) { printf( "The total memory allocated is %7.3lf MB.\n", From aa07f1cd73c2a47265ea2684fb040374ab91f665 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 25 Jan 2023 11:54:12 -0800 Subject: [PATCH 04/13] Finally fixed the intel timestamps for tracing. 
--- src/apex/L0/utils.h | 2 +- src/apex/L0/ze_utils.h | 14 +-- src/apex/apex_clock.hpp | 4 + src/apex/apex_level0.cpp | 235 ++++++++++++--------------------------- 4 files changed, 85 insertions(+), 170 deletions(-) diff --git a/src/apex/L0/utils.h b/src/apex/L0/utils.h index 9def9b00..4b6c3787 100644 --- a/src/apex/L0/utils.h +++ b/src/apex/L0/utils.h @@ -48,7 +48,7 @@ namespace utils { #if defined(__gnu_linux__) inline uint64_t GetTime(clockid_t id) { - timespec ts{0}; + timespec ts{0,0}; int status = clock_gettime(id, &ts); PTI_ASSERT(status == 0); return ts.tv_sec * NSEC_IN_SEC + ts.tv_nsec; diff --git a/src/apex/L0/ze_utils.h b/src/apex/L0/ze_utils.h index 323b7fb5..4615592b 100644 --- a/src/apex/L0/ze_utils.h +++ b/src/apex/L0/ze_utils.h @@ -96,7 +96,7 @@ namespace utils { for (auto driver : GetDriverList()) { for (auto device : GetDeviceList(driver)) { - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); if (props.type == ZE_DEVICE_TYPE_GPU) { @@ -120,7 +120,7 @@ namespace utils { for (auto driver : GetDriverList()) { for (auto device : GetDeviceList(driver)) { - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); if (props.type == ZE_DEVICE_TYPE_GPU) { @@ -169,7 +169,7 @@ namespace utils { inline std::string GetDeviceName(ze_device_handle_t device) { PTI_ASSERT(device != nullptr); ze_result_t status = ZE_RESULT_SUCCESS; - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, nullptr}; status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); return props.name; 
@@ -336,7 +336,7 @@ namespace utils { inline uint64_t GetDeviceTimerFrequency(ze_device_handle_t device) { PTI_ASSERT(device != nullptr); - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); return props.timerResolution; @@ -344,7 +344,7 @@ namespace utils { inline uint64_t GetMetricTimerFrequency(ze_device_handle_t device) { PTI_ASSERT(device != nullptr); - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); return props.timerResolution; @@ -352,7 +352,7 @@ namespace utils { inline uint64_t GetDeviceTimestampMask(ze_device_handle_t device) { PTI_ASSERT(device != nullptr); - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); return (1ull << props.kernelTimestampValidBits) - 1ull; @@ -362,7 +362,7 @@ namespace utils { #ifdef PTI_OA_TIMESTAMP_VALID_BITS return (1ull << PTI_OA_TIMESTAMP_VALID_BITS) - 1ull; #else - ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, nullptr}; ze_result_t status = zeDeviceGetProperties(device, &props); PTI_ASSERT(status == ZE_RESULT_SUCCESS); return (1ull << props.kernelTimestampValidBits) - 1ull; diff --git a/src/apex/apex_clock.hpp b/src/apex/apex_clock.hpp index ff964465..40f36d01 100644 --- a/src/apex/apex_clock.hpp +++ b/src/apex/apex_clock.hpp @@ -9,7 +9,11 @@ #pragma once #include +#if defined(APEX_WITH_LEVEL0) // needed to map to GPU 
time +#define MYCLOCK std::chrono::steady_clock +#else #define MYCLOCK std::chrono::system_clock +#endif namespace apex { diff --git a/src/apex/apex_level0.cpp b/src/apex/apex_level0.cpp index aa69646d..09b3aaf5 100644 --- a/src/apex/apex_level0.cpp +++ b/src/apex/apex_level0.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,15 +35,12 @@ using namespace apex; static ZeApiCollector* api_collector = nullptr; static ZeKernelCollector* kernel_collector = nullptr; static chrono::steady_clock::time_point start_time; -static int gpu_task_id = 0; -static int host_api_task_id = 0; -static uint64_t first_clock_timestamp; -static uint64_t first_cpu_timestamp; -static uint64_t first_gpu_timestamp; static uint64_t cpu_delta = 0L; static uint64_t gpu_delta = 0L; static uint64_t last_gpu_timestamp = 0L; static uint64_t gpu_offset = 0L; +static uint64_t device_resolution{0L}; +static uint64_t device_mask{0L}; // External Tool Interface //////////////////////////////////////////////////// @@ -77,97 +75,70 @@ void SetToolEnv() { namespace apex { namespace level0 { -static void PrintResults() { - chrono::steady_clock::time_point end = chrono::steady_clock::now(); - chrono::duration time = end - start_time; - - PTI_ASSERT(kernel_collector != nullptr); - const ZeKernelInfoMap& kernel_info_map = kernel_collector->GetKernelInfoMap(); - if (kernel_info_map.size() == 0) { - return; - } - - uint64_t total_duration = 0; - for (auto& value : kernel_info_map) { - total_duration += value.second.total_time; - } - - cerr << endl; - cerr << "=== Device Timing Results: ===" << endl; - cerr << endl; - cerr << "Total Execution Time (ns): " << time.count() << endl; - cerr << "Total Device Time (ns): " << total_duration << endl; - cerr << endl; - - if (total_duration > 0) { - ZeKernelCollector::PrintKernelsTable(kernel_info_map); - } - - cerr << endl; -} - -// Internal Tool Functionality //////////////////////////////////////////////// - -static void 
APIPrintResults() { - chrono::steady_clock::time_point end = chrono::steady_clock::now(); - chrono::duration time = end - start_time; - - PTI_ASSERT(api_collector != nullptr); - const ZeFunctionInfoMap& function_info_map = api_collector->GetFunctionInfoMap(); - if (function_info_map.size() == 0) { - return; - } - - uint64_t total_duration = 0; - for (auto& value : function_info_map) { - total_duration += value.second.total_time; - } - - cerr << endl; - cerr << "=== API Timing Results: ===" << endl; - cerr << endl; - cerr << "Total Execution Time (ns): " << time.count() << endl; - cerr << "Total API Time (ns): " << total_duration << endl; - cerr << endl; - - if (total_duration > 0) { - ZeApiCollector::PrintFunctionsTable(function_info_map); - } - - std::cerr << std::endl; -} - -uint64_t TAUTranslateGPUTimestamp(uint64_t gpu_ts) { - // gpu_ts is in nanoseconds. - uint64_t new_ts = gpu_ts + gpu_delta; - return new_ts; -} - -uint64_t TAUTranslateCPUTimestamp(uint64_t cpu_ts) { - // cpu_ts is in nanoseconds. - uint64_t new_ts = cpu_ts + cpu_delta; - return new_ts; -} - -void TAUOnAPIFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { +/* +taken from: https://github.com/intel/pti-gpu/blob/master/chapters/device_activity_tracing/LevelZero.md +Time Correlation + +Common problem while kernel timestamps collection is to map these timestamps +to general CPU timeline. Since Level Zero provides kernel timestamps in GPU +clocks, one may need to convert them to some CPU time. Starting from Level +Zero 1.1, new function zeDeviceGetGlobalTimestamps is available. 
Using this +function, one can get correlated host (CPU) and device (GPU) timestamps for +any particular device: + + uint64_t host_timestamp = 0, device_timestamp = 0; + ze_result_t status = zeDeviceGetGlobalTimestamps( + device, &host_timestamp, &device_timestamp); + assert(status == ZE_RESULT_SUCCESS); + +Host timestamp value corresponds to CLOCK_MONOTONIC_RAW on Linux or +QueryPerformanceCounter on Windows, while device timestamp for GPU is +collected in raw GPU cycles. + +Note that the number of valid bits for the device timestamp returned by +zeDeviceGetGlobalTimestamps is timestampValidBits, while the global kernel +timestamp returned by zeEventQueryKernelTimestamp has kernelTimestampValidBits +(both values are fields of ze_device_properties_t). And currently +kernelTimestampValidBits is less than timestampValidBits, so to map kernels +into CPU timeline one may need to truncate device timestamp to +kernelTimestampValidBits: + + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + assert(status == ZE_RESULT_SUCCESS); + uint64_t mask = (1ull << props.kernelTimestampValidBits) - 1ull; + uint64_t kernel_timestamp = (device_timestamp & mask); + +To convert GPU cycles into seconds one may use timerResolution field from +ze_device_properties_t structure, that represents cycles per second starting +from Level Zero 1.2: + + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + assert(status == ZE_RESULT_SUCCESS); + const uint64_t NSEC_IN_SEC = 1000000000; + uint64_t device_timestamp_ns = NSEC_IN_SEC * device_timestamp / props.timerResolution; + +*/ + +/* That said, all the timestamps are converted for us in ze_kernel_collector.h. + However, we need to apply a delta. 
+ */ + +void OnAPIFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { uint64_t taskid; taskid = *((uint64_t *) data); - uint64_t started_translated = TAUTranslateCPUTimestamp(started); - uint64_t ended_translated = TAUTranslateCPUTimestamp(ended); - DEBUG_PRINT("APEX: OnAPIFinishCallback: (raw) name: %s started: %lu ended: %lu task id=%lu\n", - name.c_str(), started, ended, taskid); - DEBUG_PRINT("APEX: OnAPIFinishCallback: (translated) name: %s started: %lu ended: %lu task id=%lu\n", - name.c_str(), started_translated, ended_translated, taskid); - // We now need to start a timer on a task at the started_translated time and end at ended_translated + DEBUG_PRINT("APEX: OnAPIFinishCallback: (raw) name: %s started: %lu ended: %lu at: %lu task id=%lu\n", + name.c_str(), started, ended, profiler::now_ns(), taskid); // create a task_wrapper, as a child of the current timer auto tt = new_task(name, UINT64_MAX, nullptr); // create an APEX profiler to store this data - we can't start // then stop because we have timestamps already. auto prof = std::make_shared(tt); - prof->set_start(started_translated); - prof->set_end(ended_translated); + prof->set_start(started); + prof->set_end(ended); // important! Otherwise we might get the wrong end timestamp. 
prof->stopped = true; // Get the singleton APEX instance @@ -244,16 +215,20 @@ void store_profiler_data(const std::string &name, } -void TAUOnKernelFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { +void OnKernelFinishCallback(void *data, const std::string& name, uint64_t started, uint64_t ended) { int taskid; taskid = *((int *) data); - uint64_t started_translated = TAUTranslateGPUTimestamp(started); - uint64_t ended_translated = TAUTranslateGPUTimestamp(ended); - DEBUG_PRINT("APEX: : (raw) name: %s started: %lu ended: %lu task id=%d\n", - name.c_str(), started, ended, taskid); - DEBUG_PRINT("APEX: : (raw) name: %s started: %lu ended: %lu task id=%d\n", - name.c_str(), started_translated, ended_translated, taskid); + /* We get a start and stop timestamp from the API in nanoseconds - but they + only make sense relative to each other. however, we're getting a callback + at exactly the time the kernel finishes, so we can assume the end time is + now, and then take a delta from now for the start time. 
*/ + uint64_t ended_translated = profiler::now_ns(); + uint64_t started_translated = ended_translated - (ended - started); + DEBUG_PRINT("APEX: : (raw) name: %s started: %20lu ended: %20lu at: %20lu task id=%d\n", + name.substr(0,10).c_str(), started, ended, profiler::now_ns(), taskid); + DEBUG_PRINT("APEX: : (raw) name: %s started: %20lu ended: %20lu at: %20lu task id=%d\n", + name.substr(0,10).c_str(), started_translated, ended_translated, profiler::now_ns(), taskid); last_gpu_timestamp = ended; int device_num = 0; @@ -289,58 +264,18 @@ void EnableProfiling() { return; } + // register a callback for Kernel calls uint64_t *kernel_taskid = new uint64_t; - //TAU_CREATE_TASK(*kernel_taskid); void *pk = (void *) kernel_taskid; - gpu_task_id = *kernel_taskid; - uint64_t *api_taskid = new uint64_t; - //*host_taskid = RtsLayer::myThread(); - //TAU_CREATE_TASK(*api_taskid); - host_api_task_id = *api_taskid; kernel_collector = ZeKernelCollector::Create(driver, - TAUOnKernelFinishCallback, pk); - /* - //uint64_t gpu_ts = utils::i915::GetGpuTimestamp() & 0x0FFFFFFFF; - uint64_t gpu_ts = utils::i915::GetGpuTimestamp() ; - std::cout <<"TAU: Earliest GPU timestamp "<DisableTracing(); - //if (TauEnv_get_verbose()) - //PrintResults(); delete kernel_collector; } if (api_collector != nullptr) { api_collector->DisableTracing(); - //if (TauEnv_get_verbose()) - //APIPrintResults(); delete api_collector; } - //uint64_t gpu_end_ts = utils::i915::GetGpuTimestamp() & 0x0FFFFFFFF; - /* - uint64_t gpu_end_ts = utils::i915::GetGpuTimestamp(); - std::cout <<"APEX: Latest GPU timestamp "< chrono_dt = chrono_end - start_time; - DEBUG_PRINT("APEX: Diff (chrono) =%ld \n", chrono_dt.count()); } } // namespace level0 From a0ac36127ca157c15df96512688fe34e0fbd1451 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 25 Jan 2023 14:27:20 -0800 Subject: [PATCH 05/13] Making perfetto off by default. 
--- cmake/Modules/APEX_DefaultOptions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/APEX_DefaultOptions.cmake b/cmake/Modules/APEX_DefaultOptions.cmake index c08f8c72..8b32304a 100644 --- a/cmake/Modules/APEX_DefaultOptions.cmake +++ b/cmake/Modules/APEX_DefaultOptions.cmake @@ -17,7 +17,7 @@ option (APEX_WITH_STARPU "Enable APEX StarPU support" FALSE) option (APEX_WITH_TCMALLOC "Enable TCMalloc heap management" FALSE) option (APEX_WITH_JEMALLOC "Enable JEMalloc heap management" FALSE) option (APEX_WITH_LM_SENSORS "Enable LM Sensors support" FALSE) -option (APEX_WITH_PERFETTO "Enable native Perfetto trace support" TRUE) +option (APEX_WITH_PERFETTO "Enable native Perfetto trace support" FALSE) option (APEX_BUILD_TESTS "Build APEX tests (for 'make test')" FALSE) option (APEX_CUDA_TESTS "Build APEX CUDA tests (for 'make test')" FALSE) option (APEX_HIP_TESTS "Build APEX HIP tests (for 'make test')" FALSE) From bb47c00e8e29e3352a30a15eaa20e5089a631f4c Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 31 Jan 2023 12:42:51 -0500 Subject: [PATCH 06/13] Debugging on Crusher --- CMakeLists.txt | 4 +++ src/apex/apex_error_handling.cpp | 59 ++++++++++++++++++++++++------ src/apex/apex_rocm_smi.cpp | 18 ++++++---- src/apex/hip_trace.cpp | 11 ++++++ src/apex/profiler_listener.cpp | 2 ++ src/scripts/apex_exec | 5 ++- src/unit_tests/C/CMakeLists.txt | 1 + src/unit_tests/C/crasher.c | 62 ++++++++++++++++++++++++++++++++ 8 files changed, 144 insertions(+), 18 deletions(-) create mode 100644 src/unit_tests/C/crasher.c diff --git a/CMakeLists.txt b/CMakeLists.txt index da2eaa1a..b9fa12c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,6 +294,10 @@ endif (DEFINED APEX_SANITIZE_THREAD AND APEX_SANITIZE_THREAD) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${APEX_SANITIZE_OPTIONS} -DAPEX_ERROR_HANDLING") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${APEX_SANITIZE_OPTIONS} -DAPEX_ERROR_HANDLING") +if (DEFINED 
APEX_ERROR_HANDLING AND APEX_ERROR_HANDLING) + add_definitions(-DAPEX_ERROR_HANDLING) +endif (DEFINED APEX_ERROR_HANDLING AND APEX_ERROR_HANDLING) + # --------------------------------------------------- # Set pedantic error flags if available # --------------------------------------------------- diff --git a/src/apex/apex_error_handling.cpp b/src/apex/apex_error_handling.cpp index f1d5eb94..d87a0cb9 100644 --- a/src/apex/apex_error_handling.cpp +++ b/src/apex/apex_error_handling.cpp @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "thread_instance.hpp" #include "address_resolution.hpp" #include @@ -62,9 +64,9 @@ static void apex_custom_signal_handler(int sig) { //std::unique_lock l(output_mutex); fflush(stderr); std::cerr << std::endl; - std::cerr << "********* Thread " << apex::thread_instance::get_id() << " " << - strsignal(sig) << " *********"; - std::cerr << std::endl; + std::cerr << "********* Node " << apex::apex::instance()->get_node_id() << + ", Thread " << apex::thread_instance::get_id() << " " << + strsignal(sig) << " *********" << std::endl; std::cerr << std::endl; if(errnum) { std::cerr << "Value of errno: " << errno << std::endl; @@ -83,16 +85,53 @@ static void apex_custom_signal_handler(int sig) { _exit(-1); } +std::map other_handlers; + +static void apex_custom_signal_handler_advanced(int sig, siginfo_t * info, void * context) { + apex_custom_signal_handler(sig); + // call the old handler + other_handlers[sig].sa_sigaction(sig, info, context); +} + int apex_register_signal_handler() { + if (apex::test_for_MPI_comm_rank(0) == 0) { + std::cout << "APEX signal handler registering..." 
<< std::endl; + } struct sigaction act; + struct sigaction old; + memset(&act, 0, sizeof(act)); + memset(&old, 0, sizeof(old)); + sigemptyset(&act.sa_mask); - act.sa_flags = 0; - act.sa_handler = apex_custom_signal_handler; - sigaction( SIGILL, &act, nullptr); - sigaction( SIGABRT, &act, nullptr); - sigaction( SIGFPE, &act, nullptr); - sigaction( SIGSEGV, &act, nullptr); - sigaction( SIGBUS, &act, nullptr); + std::array mysignals = { + SIGHUP, + SIGINT, + SIGQUIT, + SIGILL, + //SIGTRAP, + SIGIOT, + SIGBUS, + SIGFPE, + SIGKILL, + SIGSEGV, + SIGABRT, + SIGTERM, + SIGSTKFLT, + SIGXCPU, + SIGXFSZ, + SIGPWR + }; + //act.sa_flags = 0; + //act.sa_handler = apex_custom_signal_handler; + act.sa_flags = SA_RESTART | SA_SIGINFO; + act.sa_sigaction = apex_custom_signal_handler_advanced; + for (auto s : mysignals) { + sigaction(s, &act, &old); + other_handlers[s] = old; + } + if (apex::test_for_MPI_comm_rank(0) == 0) { + std::cout << "APEX signal handler registered!" << std::endl; + } return 0; } diff --git a/src/apex/apex_rocm_smi.cpp b/src/apex/apex_rocm_smi.cpp index 41d513af..543c48d4 100644 --- a/src/apex/apex_rocm_smi.cpp +++ b/src/apex/apex_rocm_smi.cpp @@ -63,12 +63,14 @@ monitor::monitor (void) { RSMI_CALL(rsmi_num_monitor_devices(&deviceCount)); rsmi_version_t version; RSMI_CALL(rsmi_version_get(&version)); - std::cout << "RSMI Version " - << version.major << "." - << version.minor << "." - << version.patch << " build " - << version.build << ", Found " - << deviceCount << " total devices" << std::endl; + if (apex_options::use_verbose()) { + std::cout << "RSMI Version " + << version.major << "." + << version.minor << "." 
+ << version.patch << " build " + << version.build << ", Found " + << deviceCount << " total devices" << std::endl; + } //devices.reserve(deviceCount); // get the unit handles @@ -131,7 +133,9 @@ void monitor::query(void) { APEX_UNUSED(timestamp); if (!queried_once[d]) { - std::cout << deviceInfos[d].to_string() << std::endl; + if (apex_options::use_verbose()) { + std::cout << deviceInfos[d].to_string() << std::endl; + } } // power, in microwatts diff --git a/src/apex/hip_trace.cpp b/src/apex/hip_trace.cpp index 2e47e0d3..8e88633e 100644 --- a/src/apex/hip_trace.cpp +++ b/src/apex/hip_trace.cpp @@ -968,6 +968,17 @@ void store_counter_data(const char * name, const std::string& ctx, void process_hip_record(const roctracer_record_t* record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); + if (!apex::apex_options::use_hip_kernel_details()) { + if (strncmp(name, "Marker", 6) == 0) { + // if there's a correlation ID, clear it from the maps + if (record->correlation_id > 0) { + Globals::find_timer(record->correlation_id); + Globals::find_data(record->correlation_id); + } + // do nothing, this event is annoying. 
+ return; + } + } switch(record->op) { case HIP_OP_ID_DISPATCH: { std::string name = Globals::find_name(record->correlation_id); diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index 8eab1ea0..b5e1eb73 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -416,6 +416,8 @@ std::unordered_set free_profiles; int loc0 = task_scatterplot_samples.tellp(); if (loc0 > 32768) { counter_scatterplot_sample_file() << counter_scatterplot_samples.rdbuf(); + // flush the buffer, in case we crash + counter_scatterplot_sample_file().flush(); // reset the stringstream counter_scatterplot_samples.str(""); } diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 2cce4a16..63e5fdfb 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -494,7 +494,10 @@ if [ $ompt = yes ]; then export OMP_TOOL=enabled export OMP_TOOL_LIBRARIES=${BASEDIR}/${LIBDIR}/${APEX_LIBRARY_NAME}${SHLIBX} export OMP_TOOL_VERBOSE_INIT=stdout - export OMP_DISPLAY_ENV=true + OMPT_LIB=@OMPT_LIBRARY@: + if [ $verbose = yes ]; then + export OMP_DISPLAY_ENV=true + fi fi if [ $cpu_memory = yes ]; then MEMORY_LIB=${BASEDIR}/${LIBDIR}/libapex_memory_wrapper${SHLIBX}: diff --git a/src/unit_tests/C/CMakeLists.txt b/src/unit_tests/C/CMakeLists.txt index 3529e7b2..2f26f7af 100644 --- a/src/unit_tests/C/CMakeLists.txt +++ b/src/unit_tests/C/CMakeLists.txt @@ -38,6 +38,7 @@ set(example_programs apex_get_thread_cap apex_shutdown_throttling apex_disable + crasher ${APEX_OPENACC_TEST} ) # apex_setup_power_cap_throttling diff --git a/src/unit_tests/C/crasher.c b/src/unit_tests/C/crasher.c new file mode 100644 index 00000000..b4e90b3f --- /dev/null +++ b/src/unit_tests/C/crasher.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include + +#define handle_error(msg) \ + do { perror(msg); exit(EXIT_FAILURE); } while (0) + +char *buffer; +int flag=0; + +int main(int argc, char *argv[]) +{ + char *p; char a; + int pagesize; + + 
pagesize=4096; + + /* Allocate a buffer aligned on a page boundary; + initial protection is PROT_READ | PROT_WRITE */ + + buffer = memalign(pagesize, 4 * pagesize); + if (buffer == NULL) + handle_error("memalign"); + + printf("Start of region: 0x%lx\n", (long) buffer); + printf("Start of region: 0x%lx\n", (long) buffer+pagesize); + printf("Start of region: 0x%lx\n", (long) buffer+2*pagesize); + printf("Start of region: 0x%lx\n", (long) buffer+3*pagesize); + //if (mprotect(buffer + pagesize * 0, pagesize,PROT_NONE) == -1) + if (mprotect(buffer + pagesize * 0, pagesize,PROT_NONE) == -1) + handle_error("mprotect"); + + //for (p = buffer ; ; ) + if(flag==0) + { + p = buffer+pagesize/2; + printf("It comes here before reading memory\n"); + a = *p; //trying to read the memory + printf("It comes here after reading memory\n"); + } + else + { + if (mprotect(buffer + pagesize * 0, pagesize,PROT_READ) == -1) + handle_error("mprotect"); + a = *p; + printf("Now i can read the memory\n"); + + } +/* for (p = buffer;p<=buffer+4*pagesize ;p++ ) + { + //a = *(p); + *(p) = 'a'; + printf("Writing at address %p\n",p); + + }*/ + + printf("Loop completed\n"); /* Should never happen */ + exit(EXIT_SUCCESS); +} \ No newline at end of file From e6fbb541300397478c1518b2d5ebd811e1541fc8 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 31 Jan 2023 09:56:22 -0800 Subject: [PATCH 07/13] Replacing broken CSV output with reduced CSV output from all ranks. 
--- src/apex/apex_mpi.cpp | 6 +- src/apex/profile.hpp | 4 +- src/apex/profile_reducer.cpp | 129 +++++++++++++++++- src/apex/profile_reducer.hpp | 9 +- src/apex/profiler_listener.cpp | 73 +--------- src/apex/profiler_listener.hpp | 1 - .../LuleshMPIOpenMP/synchronous_policy.cpp | 8 ++ 7 files changed, 156 insertions(+), 74 deletions(-) diff --git a/src/apex/apex_mpi.cpp b/src/apex/apex_mpi.cpp index 80f41b07..13559471 100644 --- a/src/apex/apex_mpi.cpp +++ b/src/apex/apex_mpi.cpp @@ -22,7 +22,7 @@ #include "mpi.h" #endif -#define MPI_START_TIMER static auto p = apex::new_task(__APEX_FUNCTION__); apex::start(p); +#define MPI_START_TIMER auto p = apex::new_task(__APEX_FUNCTION__); apex::start(p); #define MPI_STOP_TIMER apex::stop(p); /* Implementation of the C API */ @@ -277,7 +277,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * sou inline int apex_measure_mpi_sync(MPI_Comm comm, const char * name, std::shared_ptr parent) { APEX_UNUSED(name); //auto _p = start(std::string(name)+" (sync)"); - static auto _p = new_task("MPI Collective Sync", UINTMAX_MAX, parent); + auto _p = new_task("MPI Collective Sync", UINTMAX_MAX, parent); start(_p); int _retval = PMPI_Barrier(comm); stop(_p); @@ -600,7 +600,7 @@ void _symbol(MPI_Fint * request, MPI_Fint * status, MPI_Fint * ierr) { \ int MPI_Barrier(MPI_Comm comm) { MPI_START_TIMER - static auto _p = apex::new_task("MPI Collective Sync"); + auto _p = apex::new_task("MPI Collective Sync"); apex::start(_p); int retval = PMPI_Barrier(comm); apex::stop(_p); diff --git a/src/apex/profile.hpp b/src/apex/profile.hpp index 53cb0251..e22b1194 100644 --- a/src/apex/profile.hpp +++ b/src/apex/profile.hpp @@ -177,7 +177,9 @@ class profile { return _profile.accumulated; } double get_inclusive_accumulated() { - return std::max(_profile.accumulated,_profile.inclusive_accumulated); + if (_profile.type == APEX_TIMER) + return std::max(_profile.accumulated,_profile.inclusive_accumulated); + return 0.0; } double 
get_accumulated_mean_threads() { return (_profile.accumulated / (double)(_profile.num_threads)); diff --git a/src/apex/profile_reducer.cpp b/src/apex/profile_reducer.cpp index 22b10927..2821f159 100644 --- a/src/apex/profile_reducer.cpp +++ b/src/apex/profile_reducer.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include /* 11 values per timer/counter by default @@ -42,7 +43,7 @@ constexpr size_t num_fields{23}; namespace apex { -std::map reduce_profiles() { +std::map reduce_profiles_for_screen() { int commrank = 0; int commsize = 1; #if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) @@ -263,5 +264,129 @@ std::map reduce_profiles() { return (all_profiles); } -} + void reduce_profiles(std::stringstream& csv_output, std::string filename) { + int commrank = 0; + int commsize = 1; +#if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) + int mpi_initialized = 0; + MPI_CALL(MPI_Initialized( &mpi_initialized )); + if (mpi_initialized) { + MPI_CALL(PMPI_Comm_rank(MPI_COMM_WORLD, &commrank)); + MPI_CALL(PMPI_Comm_size(MPI_COMM_WORLD, &commsize)); + } +#endif + // if nothing to reduce, just write the data. + if (commsize == 1) { + std::ofstream csvfile; + std::stringstream csvname; + csvname << apex_options::output_file_path(); + csvname << filesystem_separator() << filename; + std::cout << "Writing: " << csvname.str() << std::endl; + csvfile.open(csvname.str(), std::ios::out); + csvfile << csv_output.str(); + csvfile.close(); + return; + } + + size_t length{csv_output.str().size()}; + size_t max_length{length}; + // get the longest string from all ranks +#if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) + if (mpi_initialized && commsize > 1) { + MPI_CALL(PMPI_Allreduce(&length, &max_length, 1, + MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD)); + } + // so we don't have to specially handle the first string which will append + // the second string without a null character (zero). 
+ max_length = max_length + 1; +#endif + // allocate the memory to hold all output + char * rbuf = nullptr; + if (commrank == 0) { + rbuf = (char*)calloc(max_length * commsize, sizeof(char)); + } + char * sbuf = (char*)calloc(max_length, sizeof(char)); + strncpy(sbuf, csv_output.str().c_str(), length); + MPI_Gather(sbuf, max_length, MPI_CHAR, rbuf, max_length, MPI_CHAR, 0, MPI_COMM_WORLD); + + if (commrank == 0) { + std::ofstream csvfile; + std::stringstream csvname; + csvname << apex_options::output_file_path(); + csvname << filesystem_separator() << filename; + std::cout << "Writing: " << csvname.str() << std::endl; + csvfile.open(csvname.str(), std::ios::out); + char * index = rbuf; + for (auto i = 0 ; i < commsize ; i++) { + index = rbuf+(i*max_length); + std::string tmpstr{index}; + csvfile << tmpstr; + csvfile.flush(); + } + csvfile.close(); + } + free(sbuf); + free(rbuf); + } + + void reduce_flat_profiles(int node_id, int num_papi_counters, + std::vector metric_names, profiler_listener* listener) { + + std::stringstream csv_output; + if (node_id == 0) { + csv_output << "\"rank\",\"name\",\"type\",\"num samples/calls\",\"minimum\",\"mean\"," + << "\"maximum\",\"stddev\",\"total\",\"inclusive (ns)\",\"num threads\",\"total per thread\""; +#if APEX_HAVE_PAPI + for (int i = 0 ; i < num_papi_counters ; i++) { + csv_output << ",\"" << metric_names[i] << "\""; + } +#endif + if (apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) { + csv_output << ",\"allocations\", \"bytes allocated\", \"frees\", \"bytes freed\""; + } + csv_output << std::endl; + } + + /* Get a list of all profile names */ + std::vector& tids = get_available_profiles(); + for (auto tid : tids) { + std::string name{tid.get_name()}; + auto p = listener->get_profile(tid); + csv_output << node_id << ",\"" << name << "\","; + if (p->get_type() == APEX_TIMER) { + csv_output << "\"timer\","; + } else { + csv_output << "\"counter\","; + } + csv_output << llround(p->get_calls()) << ","; + 
// add all the extra columns for counter and timer data + csv_output << std::llround(p->get_minimum()) << ","; + csv_output << std::llround(p->get_mean()) << ","; + csv_output << std::llround(p->get_maximum()) << ","; + csv_output << std::llround(p->get_stddev()) << ","; + csv_output << std::llround(p->get_accumulated()) << ","; + if (p->get_type() == APEX_TIMER) { + csv_output << std::llround(p->get_inclusive_accumulated()) << ","; + } else { + csv_output << std::llround(0.0) << ","; + } + csv_output << std::llround(p->get_num_threads()) << ","; + csv_output << std::llround(p->get_accumulated()/p->get_num_threads()); +#if APEX_HAVE_PAPI + for (int i = 0 ; i < num_papi_counters ; i++) { + csv_output << "," << std::llround(p->get_papi_metrics()[i]); + } +#endif + if (apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) { + csv_output << "," << p->get_allocations(); + csv_output << "," << p->get_bytes_allocated(); + csv_output << "," << p->get_frees(); + csv_output << "," << p->get_bytes_freed(); + } + csv_output << std::endl; + } + reduce_profiles(csv_output, "apex_flat_profiles.csv"); + } + +} // namespace diff --git a/src/apex/profile_reducer.hpp b/src/apex/profile_reducer.hpp index 34cfe907..85eb7562 100644 --- a/src/apex/profile_reducer.hpp +++ b/src/apex/profile_reducer.hpp @@ -8,10 +8,17 @@ #include #include +#include #include "apex_types.h" +#include "profiler_listener.hpp" namespace apex { -std::map reduce_profiles(); +std::map reduce_profiles_for_screen(); + +void reduce_profiles(std::stringstream& csv_output, std::string filename); +void reduce_flat_profiles(int node_id, int num_papi_counters, + std::vector metric_names, + profiler_listener* listener); } diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index 8eab1ea0..7e38694e 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -488,7 +488,7 @@ std::unordered_set free_profiles; void profiler_listener::write_one_timer(std::string 
&action_name, profile * p, stringstream &screen_output, - stringstream &csv_output, double &total_accumulated, + double &total_accumulated, double &total_main, double &wall_main, bool include_stops = false, bool include_papi = false) { #ifndef APEX_HAVE_PAPI @@ -540,17 +540,6 @@ std::unordered_set free_profiles; } } if (p->get_type() == APEX_TIMER) { - csv_output << "\"" << action_name << "\",\"timer\","; - csv_output << llround(p->get_calls()) << ","; - // add all the extra columns for counter data - csv_output << std::llround(p->get_minimum()) << ","; - csv_output << std::llround(p->get_mean()) << ","; - csv_output << std::llround(p->get_maximum()) << ","; - csv_output << std::llround(p->get_stddev()) << ","; - csv_output << std::llround(p->get_accumulated_useconds()) << ","; - csv_output << std::llround(p->get_inclusive_accumulated_useconds()) << ","; - csv_output << std::llround(p->get_num_threads()) << ","; - csv_output << std::llround(p->get_accumulated_useconds()/p->get_num_threads()); //screen_output << " --n/a-- " ; if (include_stops) { if (p->get_num_threads() > 10000) { @@ -613,7 +602,6 @@ std::unordered_set free_profiles; for (int i = 0 ; i < num_papi_counters ; i++) { screen_output << " " << string_format(FORMAT_SCIENTIFIC, (p->get_papi_metrics()[i])); - csv_output << "," << std::llround(p->get_papi_metrics()[i]); } } #endif @@ -626,7 +614,6 @@ std::unordered_set free_profiles; screen_output << " " << string_format(PAD_WITH_SPACES, to_string(std::llround(p->get_allocations())).c_str()); } - csv_output << "," << p->get_allocations(); if (p->get_bytes_allocated() > 999999) { screen_output << " " << string_format(FORMAT_SCIENTIFIC, @@ -635,7 +622,6 @@ std::unordered_set free_profiles; screen_output << " " << string_format(PAD_WITH_SPACES, to_string(std::llround(p->get_bytes_allocated())).c_str()); } - csv_output << "," << p->get_bytes_allocated(); if (p->get_frees() > 999999) { screen_output << " " << string_format(FORMAT_SCIENTIFIC, @@ -644,7 +630,6 @@ 
std::unordered_set free_profiles; screen_output << " " << string_format(PAD_WITH_SPACES, to_string(std::llround(p->get_frees())).c_str()); } - csv_output << "," << p->get_frees(); if (p->get_bytes_freed() > 999999) { screen_output << " " << string_format(FORMAT_SCIENTIFIC, @@ -653,33 +638,10 @@ std::unordered_set free_profiles; screen_output << " " << string_format(PAD_WITH_SPACES, to_string(std::llround(p->get_bytes_freed())).c_str()); } - csv_output << "," << p->get_bytes_freed(); } - //} else { - //csv_output << ",0,0,0,0"; } screen_output << endl; - csv_output << endl; } else { - /* Do CSV output */ - csv_output << "\"" << action_name << "\",\"counter\","; - csv_output << llround(p->get_calls()) << ","; - csv_output << std::llround(p->get_minimum()) << ","; - csv_output << std::llround(p->get_mean()) << ","; - csv_output << std::llround(p->get_maximum()) << ","; - csv_output << std::llround(p->get_stddev()) << ","; - // add all the extra columns for timer data - csv_output << "0,0,0,0"; // accumulated seconds, inclusive, threads, per thread - meaningless -#if APEX_HAVE_PAPI - for (int i = 0 ; i < num_papi_counters ; i++) { - csv_output << ",0"; - } - if (apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) { - csv_output << std::string( 4, ',' ); - } -#endif - csv_output << endl; - /* Do Screen output */ if (action_name.find('%') == string::npos && p->get_minimum() > 10000) { screen_output << string_format(FORMAT_SCIENTIFIC, p->get_minimum()) << " " ; @@ -744,9 +706,6 @@ std::unordered_set free_profiles; // create a stringstream to hold all the screen output - we may not // want to write it out stringstream screen_output; - // create a stringstream to hold all the CSV output - we may not - // want to write it out - stringstream csv_output; // iterate over the profiles in the address map screen_output << endl << "Start Date/Time: " << timestamp_started; screen_output << endl << "Elapsed time: " << wall_clock_main @@ -777,17 +736,6 @@ 
std::unordered_set free_profiles; } } } - csv_output << "\"name\",\"type\",\"num samples/calls\",\"minimum\",\"mean\"," - << "\"maximum\",\"stddev\",\"total microseconds\",\"inclusive microseconds\",\"num threads\",\"total per thread\""; -#if APEX_HAVE_PAPI - for (int i = 0 ; i < num_papi_counters ; i++) { - csv_output << ",\"" << metric_names[i] << "\""; - } -#endif - if (apex_options::track_cpu_memory() || apex_options::track_gpu_memory()) { - csv_output << ",\"allocations\", \"bytes allocated\", \"frees\", \"bytes freed\""; - } - csv_output << endl; if (id_vector.size() > 0) { screen_output << "Counter : " << " #samp | minimum | mean | maximum | stddev " << endl; @@ -801,7 +749,7 @@ std::unordered_set free_profiles; auto p = all_profiles.find(name); if (p != all_profiles.end()) { profile tmp(p->second); - write_one_timer(name, &tmp, screen_output, csv_output, + write_one_timer(name, &tmp, screen_output, total_accumulated, divisor, wall_clock_main); } } @@ -866,7 +814,7 @@ std::unordered_set free_profiles; auto p = all_profiles.find(name); if (p != all_profiles.end()) { profile tmp(p->second); - write_one_timer(name, &tmp, screen_output, csv_output, + write_one_timer(name, &tmp, screen_output, total_accumulated, divisor, wall_clock_main); if (name.compare(APEX_MAIN_STR) != 0) { total_hpx_threads = total_hpx_threads + tmp.get_calls(); @@ -901,7 +849,7 @@ std::unordered_set free_profiles; // write the main timer std::string tmp_main(APEX_MAIN_STR); - write_one_timer(tmp_main, total_time, screen_output, csv_output, + write_one_timer(tmp_main, total_time, screen_output, total_accumulated, divisor, wall_clock_main, true, true); // iterate over the timers for(auto& pair_itr : timer_vector) { @@ -910,7 +858,7 @@ std::unordered_set free_profiles; auto p = all_profiles.find(name); if (p != all_profiles.end()) { profile tmp(p->second); - write_one_timer(name, &tmp, screen_output, csv_output, + write_one_timer(name, &tmp, screen_output, total_accumulated, divisor, 
wall_clock_main, true, true); if (name.compare(APEX_MAIN_STR) != 0) { total_hpx_threads = total_hpx_threads + tmp.get_calls(); @@ -961,14 +909,7 @@ std::unordered_set free_profiles; } if (apex_options::use_csv_output()) { - ofstream csvfile; - stringstream csvname; - csvname << apex_options::output_file_path(); - csvname << filesystem_separator() << "apex." << node_id << ".csv"; - // std::cout << "Writing: " << csvname.str() << std::endl; - csvfile.open(csvname.str(), ios::out); - csvfile << csv_output.str(); - csvfile.close(); + reduce_flat_profiles(node_id, num_papi_counters, metric_names, this); } if (apex_options::use_tau()) { tau_listener::Tau_stop_wrapper("profiler_listener::finalize_profiles"); @@ -1781,7 +1722,7 @@ if (rc != 0) cout << "PAPI error! " << name << ": " << PAPI_strerror(rc) << endl if (apex_options::use_screen_output() || apex_options::use_csv_output()) { // reduce/gather all profiles from all ranks - auto reduced = reduce_profiles(); + auto reduced = reduce_profiles_for_screen(); if (apex_options::process_async_state()) { finalize_profiles(data, reduced); } diff --git a/src/apex/profiler_listener.hpp b/src/apex/profiler_listener.hpp index 2b10d16f..d4fc07e0 100644 --- a/src/apex/profiler_listener.hpp +++ b/src/apex/profiler_listener.hpp @@ -100,7 +100,6 @@ class profiler_listener : public event_listener { std::shared_ptr main_timer; void write_one_timer(std::string &name, profile * p, std::stringstream &screen_output, - std::stringstream &csv_output, double &total_accumulated, double &total_main, double &wall_main, bool include_stops, bool include_papi); diff --git a/src/examples/LuleshMPIOpenMP/synchronous_policy.cpp b/src/examples/LuleshMPIOpenMP/synchronous_policy.cpp index d263e0a9..6e594985 100644 --- a/src/examples/LuleshMPIOpenMP/synchronous_policy.cpp +++ b/src/examples/LuleshMPIOpenMP/synchronous_policy.cpp @@ -54,16 +54,24 @@ int apex_example_policy_func(apex_context const context) { double one_worker = (1.0/__active_threads)*0.51; 
if (((mytimer/outvalues[1]) > ((__active_threads/outvalues[0]) + one_worker)) && (__active_threads < apex::apex_options::throttling_max_threads())) { + /* std::cout << __myrank << ": " << one_worker << " timer: " << (mytimer/outvalues[1]) << " thread: " << (__active_threads/outvalues[0]) << std::endl; + */ __active_threads++; + /* std::cout << __myrank << ": New thread count: " << __active_threads << std::endl; + */ } else if (((mytimer/outvalues[1]) < ((__active_threads/outvalues[0]) - one_worker)) && (__active_threads > apex::apex_options::throttling_min_threads())) { + /* std::cout << __myrank << ": " << one_worker << " timer: " << (mytimer/outvalues[1]) << " thread: " << (__active_threads/outvalues[0]) << std::endl; + */ __active_threads--; + /* std::cout << __myrank << ": New thread count: " << __active_threads << std::endl; + */ } return APEX_NOERROR; } From 907db729a8036af8505e0f911c7c96e50b2353d5 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 31 Jan 2023 14:30:05 -0800 Subject: [PATCH 08/13] Fully reduced CSV and tasktree output now. The tasktree output can be post-processed to generate any necessary Hatchet, Graphviz or Trilinos output if desired. --- src/apex/dependency_tree.cpp | 93 +++++++++++++++++++++++++--------- src/apex/dependency_tree.hpp | 28 ++++++---- src/apex/profile_reducer.cpp | 18 +++++-- src/apex/profiler_listener.cpp | 29 ++++++++--- 4 files changed, 125 insertions(+), 43 deletions(-) diff --git a/src/apex/dependency_tree.cpp b/src/apex/dependency_tree.cpp index 5b68a6d2..0f058c77 100644 --- a/src/apex/dependency_tree.cpp +++ b/src/apex/dependency_tree.cpp @@ -74,10 +74,10 @@ void Node::writeNode(std::ofstream& outfile, double total) { outfile << std::endl; } - double acc = (data == task_identifier::get_main_task_id() || accumulated == 0.0) ? - total : accumulated; + double acc = (data == task_identifier::get_main_task_id() || getAccumulated() == 0.0) ? 
+ total : getAccumulated(); node_color * c = get_node_color_visible(acc, 0.0, total, data->get_tree_name()); - double ncalls = (calls == 0) ? 1 : calls; + double ncalls = (getCalls() == 0) ? 1 : getCalls(); std::string decoration; std::string font; @@ -127,20 +127,20 @@ double Node::writeNodeASCII(std::ofstream& outfile, double total, size_t indent) outfile << "|-> "; indent++; // write out the inclusive and percent of total - double acc = (data == task_identifier::get_main_task_id() || accumulated == 0.0) ? - total : accumulated; - double percentage = (accumulated / total) * 100.0; + double acc = (data == task_identifier::get_main_task_id() || getAccumulated() == 0.0) ? + total : getAccumulated(); + double percentage = (getAccumulated() / total) * 100.0; outfile << std::fixed << std::setprecision(precision) << acc << " - " << std::fixed << std::setprecision(precision) << percentage << "% ["; // write the number of calls - double ncalls = (calls == 0) ? 1 : calls; + double ncalls = (getCalls() == 0) ? 
1 : getCalls(); outfile << std::fixed << std::setprecision(0) << ncalls << "]"; // write other stats - min, max, stddev double mean = acc / ncalls; // avoid -0.0 which will cause a -nan for stddev - double variance = std::max(0.0,((sumsqr / ncalls) - (mean * mean))); + double variance = std::max(0.0,((getSumSquares() / ncalls) - (mean * mean))); double stddev = sqrt(variance); - outfile << " {min=" << std::fixed << std::setprecision(precision) << min << ", max=" << max + outfile << " {min=" << std::fixed << std::setprecision(precision) << getMinimum() << ", max=" << getMaximum() << ", mean=" << mean << ", var=" << variance << ", std dev=" << stddev << ", inclusive=" << inclusive @@ -191,13 +191,13 @@ double Node::writeNodeJSON(std::ofstream& outfile, double total, size_t indent) << "\", \"type\": \"function\", \"rank\": " << apex::instance()->get_node_id() << "}, "; // write out the inclusive - double acc = (data == task_identifier::get_main_task_id() || accumulated == 0.0) ? - total : std::min(total, accumulated); + double acc = (data == task_identifier::get_main_task_id() || getAccumulated() == 0.0) ? + total : std::min(total, getAccumulated()); // solve for the exclusive double excl = acc; for (auto c : children) { - excl = excl - c.second->accumulated; + excl = excl - c.second->getAccumulated(); } if (excl < 0.0) { excl = 0.0; @@ -205,15 +205,15 @@ double Node::writeNodeJSON(std::ofstream& outfile, double total, size_t indent) // Don't write out synchronization events! They confuse the graph. if (data->get_tree_name().find("Synchronize") != std::string::npos) acc = 0.0; - double ncalls = (calls == 0) ? 1 : calls; + double ncalls = (getCalls() == 0) ? 
1 : getCalls(); outfile << "\"metrics\": {\"time\": " << excl << ", \"total time (inc)\": " << acc << ", \"time (inc cpu)\": " << (acc / (double)(thread_ids.size())) << ", \"time (inc wall)\": " << inclusive << ", \"num threads\": " << thread_ids.size() - << ", \"min (inc)\": " << min - << ", \"max (inc)\": " << max - << ", \"sumsqr (inc)\": " << sumsqr + << ", \"min (inc)\": " << getMinimum() + << ", \"max (inc)\": " << getMaximum() + << ", \"sumsqr (inc)\": " << getSumSquares() << ", \"calls\": " << ncalls << "}"; // if no children, we are done @@ -251,7 +251,7 @@ void Node::writeTAUCallpath(std::ofstream& outfile, std::string prefix) { if (prefix.size() == 0 && children.size() == 0) { return ; } // get the inclusive amount for this timer - double acc = accumulated * 1000000; // stored in seconds, we need to convert to microseconds + double acc = getAccumulated() * 1000000; // stored in seconds, we need to convert to microseconds // update the prefix if (data->get_name().compare(APEX_MAIN_STR) == 0) { @@ -286,7 +286,7 @@ void Node::writeTAUCallpath(std::ofstream& outfile, std::string prefix) { // otherwise, write out this node outfile << "\"" << prefix << "\" "; // write the number of calls - double ncalls = (calls == 0) ? 1 : calls; + double ncalls = (getCalls() == 0) ? 
1 : getCalls(); outfile << std::fixed << std::setprecision(0) << ncalls << " "; // write out subroutines outfile << child_calls << " "; @@ -319,17 +319,64 @@ void Node::addAccumulated(double value, double incl, bool is_resume, uint64_t th static std::mutex m; m.lock(); if (!is_resume) { - calls+=1; + getCalls()+=1; inclusive = inclusive + incl; } - accumulated = accumulated + value; - if (min == 0.0 || value < min) { min = value; } - if (value > max) { max = value; } - sumsqr = sumsqr + (value*value); + getAccumulated() = getAccumulated() + value; + if (getMinimum() == 0.0 || value < getMinimum()) { getMinimum() = value; } + if (value > getMaximum()) { getMaximum() = value; } + getSumSquares() = getSumSquares() + (value*value); thread_ids.insert(thread_id); m.unlock(); } +double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) { + static size_t depth = 0; + APEX_ASSERT(total > 0.0); + // write out the node id and graph node index and the name + outfile << node_id << "," << index << ","; + outfile << ((parent == nullptr) ? 0 : parent->index) << ","; + outfile << depth << ",\""; + outfile << data->get_tree_name() << "\","; + // write out the inclusive + double acc = (data == task_identifier::get_main_task_id() || getAccumulated() == 0.0) ? + total : getAccumulated(); + // write the number of calls + double ncalls = (getCalls() == 0) ? 
1 : getCalls(); + outfile << std::fixed << std::setprecision(0) << ncalls << ","; + outfile << thread_ids.size() << ","; + // write other stats - min, max, stddev + double mean = acc / ncalls; + outfile << std::setprecision(9); + outfile << acc << ","; + outfile << getMinimum() << ","; + outfile << mean << ","; + outfile << getMaximum() << ","; + // avoid -0.0 which will cause a -nan for stddev + double variance = std::max(0.0,((getSumSquares() / ncalls) - (mean * mean))); + double stddev = sqrt(variance); + outfile << stddev; + // end the line + outfile << std::endl; + + // sort the children by accumulated time + std::vector > sorted; + for (auto& it : children) { + sorted.push_back(it); + } + sort(sorted.begin(), sorted.end(), cmp); + + // do all the children + double remainder = acc; + depth++; + for (auto c : sorted) { + double tmp = c.second->writeNodeCSV(outfile, total, node_id); + remainder = remainder - tmp; + } + depth--; + return acc; +} + } // dependency_tree } // apex diff --git a/src/apex/dependency_tree.hpp b/src/apex/dependency_tree.hpp index f41db546..4ed6106e 100644 --- a/src/apex/dependency_tree.hpp +++ b/src/apex/dependency_tree.hpp @@ -13,6 +13,7 @@ #include #include #include +#include "apex_types.h" #include "task_identifier.hpp" namespace apex { @@ -24,12 +25,13 @@ class Node { task_identifier* data; Node* parent; size_t count; - double calls; - double accumulated; + apex_profile prof; + //double calls; + //double accumulated; + //double min; + //double max; + //double sumsqr; double inclusive; - double min; - double max; - double sumsqr; size_t index; std::set thread_ids; std::unordered_map children; @@ -37,9 +39,13 @@ class Node { static std::atomic nodeCount; public: Node(task_identifier* id, Node* p) : - data(id), parent(p), count(1), calls(0), accumulated(0), - inclusive(0), min(0), max(0), sumsqr(0), + data(id), parent(p), count(1), inclusive(0), index(nodeCount.fetch_add(1, std::memory_order_relaxed)) { + prof.calls = 0.0; + 
prof.accumulated = 0.0; + prof.minimum = 0.0; + prof.maximum = 0.0; + prof.sum_squares = 0.0; } ~Node() { treeMutex.lock(); @@ -53,13 +59,17 @@ class Node { task_identifier* getData() { return data; } Node* getParent() { return parent; } size_t getCount() { return count; } - size_t getCalls() { return calls; } - double getAccumulated() { return accumulated; } + inline double& getCalls() { return prof.calls; } + inline double& getAccumulated() { return prof.accumulated; } + inline double& getMinimum() { return prof.minimum; } + inline double& getMaximum() { return prof.maximum; } + inline double& getSumSquares() { return prof.sum_squares; } void addAccumulated(double value, double incl, bool is_resume, uint64_t thread_id); size_t getIndex() { return index; }; std::string getName() { return data->get_name(); }; void writeNode(std::ofstream& outfile, double total); double writeNodeASCII(std::ofstream& outfile, double total, size_t indent); + double writeNodeCSV(std::stringstream& outfile, double total, int node_id); double writeNodeJSON(std::ofstream& outfile, double total, size_t indent); void writeTAUCallpath(std::ofstream& outfile, std::string prefix); static size_t getNodeCount() { diff --git a/src/apex/profile_reducer.cpp b/src/apex/profile_reducer.cpp index 2821f159..64e0994b 100644 --- a/src/apex/profile_reducer.cpp +++ b/src/apex/profile_reducer.cpp @@ -300,21 +300,30 @@ std::map reduce_profiles_for_screen() { // the second string without a null character (zero). 
max_length = max_length + 1; #endif + // allocate the send buffer + char * sbuf = (char*)calloc(max_length, sizeof(char)); + // copy into the send buffer + strncpy(sbuf, csv_output.str().c_str(), length); // allocate the memory to hold all output char * rbuf = nullptr; if (commrank == 0) { +#if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) rbuf = (char*)calloc(max_length * commsize, sizeof(char)); +#else + rbuf = sbuf; +#endif } - char * sbuf = (char*)calloc(max_length, sizeof(char)); - strncpy(sbuf, csv_output.str().c_str(), length); + +#if !defined(HPX_HAVE_NETWORKING) && defined(APEX_HAVE_MPI) MPI_Gather(sbuf, max_length, MPI_CHAR, rbuf, max_length, MPI_CHAR, 0, MPI_COMM_WORLD); +#endif if (commrank == 0) { std::ofstream csvfile; std::stringstream csvname; csvname << apex_options::output_file_path(); csvname << filesystem_separator() << filename; - std::cout << "Writing: " << csvname.str() << std::endl; + std::cout << "Writing: " << csvname.str(); csvfile.open(csvname.str(), std::ios::out); char * index = rbuf; for (auto i = 0 ; i < commsize ; i++) { @@ -324,6 +333,7 @@ std::map reduce_profiles_for_screen() { csvfile.flush(); } csvfile.close(); + std::cout << "...done." << std::endl; } free(sbuf); free(rbuf); @@ -385,7 +395,7 @@ std::map reduce_profiles_for_screen() { } csv_output << std::endl; } - reduce_profiles(csv_output, "apex_flat_profiles.csv"); + reduce_profiles(csv_output, "apex_profiles.csv"); } } // namespace diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index bc28cca4..8a6dabf4 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -1117,23 +1117,27 @@ std::unordered_set free_profiles; /* before calling parent.get_name(), make sure we create * a thread_instance object that is NOT a worker. */ thread_instance::instance(false); - ofstream myfile; - stringstream dotname; - dotname << apex_options::output_file_path(); - dotname << filesystem_separator() << "tasktree." 
<< node_id << ".dot"; - myfile.open(dotname.str().c_str()); + auto root = task_wrapper::get_apex_main_wrapper(); // our TOTAL available time is the elapsed * the number of threads, or cores - int num_worker_threads = thread_instance::get_num_workers(); auto main_id = task_identifier::get_main_task_id(); profile * total_time = get_profile(*main_id); double wall_clock_main = total_time->get_accumulated_seconds(); + +#if 0 + int num_worker_threads = thread_instance::get_num_workers(); #ifdef APEX_HAVE_HPX num_worker_threads = num_worker_threads - num_non_worker_threads_registered; #endif double total_main = wall_clock_main * fmin(hardware_concurrency(), num_worker_threads); + ofstream myfile; + stringstream dotname; + dotname << apex_options::output_file_path(); + dotname << filesystem_separator() << "tasktree." << node_id << ".dot"; + myfile.open(dotname.str().c_str()); + myfile << "digraph prof {\n"; myfile << " label = \"Start Date/Time: " << timestamp_started; myfile << "\\lElapsed Time: " << wall_clock_main; @@ -1147,7 +1151,6 @@ std::unordered_set free_profiles; myfile << " splines = true;\n"; myfile << " rankdir = \"LR\";\n"; myfile << " node [shape=box];\n"; - auto root = task_wrapper::get_apex_main_wrapper(); // recursively write out the tree root->tree_node->writeNode(myfile, wall_clock_main); myfile << "}\n"; @@ -1166,6 +1169,18 @@ std::unordered_set free_profiles; myfile.open(txtname2.str().c_str()); root->tree_node->writeNodeJSON(myfile, wall_clock_main, 0); myfile.close(); +#endif + // write to a single file! 
+ stringstream tree_stream; + if (node_id == 0) { + tree_stream << "\"process rank\",\"node index\",\"parent index\",\"depth\","; + tree_stream << "\"name\",\"calls\",\"threads\",\"accumulated\","; + tree_stream << "\"minimum\",\"mean\",\"maximum\","; + tree_stream << "\"sumsqr\"\n"; + } + root->tree_node->writeNodeCSV(tree_stream, wall_clock_main, node_id); + std::string filename{"apex_tasktree.csv"}; + reduce_profiles(tree_stream, filename); } /* Write TAU profiles from the collected data. */ From 90532db2f2565795329783b54a6c73c6793de80d Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 31 Jan 2023 20:47:34 -0800 Subject: [PATCH 09/13] Adding general metrics for tasktree nodes! --- src/apex/apex_mpi.cpp | 79 ++++++++++++++++++++++++++++++++++ src/apex/dependency_tree.cpp | 28 ++++++++++++ src/apex/dependency_tree.hpp | 8 ++++ src/apex/memory_wrapper.cpp | 8 ++++ src/apex/memory_wrapper.hpp | 1 + src/apex/profiler.hpp | 2 + src/apex/profiler_listener.cpp | 7 ++- 7 files changed, 132 insertions(+), 1 deletion(-) diff --git a/src/apex/apex_mpi.cpp b/src/apex/apex_mpi.cpp index 13559471..aefd440c 100644 --- a/src/apex/apex_mpi.cpp +++ b/src/apex/apex_mpi.cpp @@ -16,6 +16,7 @@ #endif #include "apex_api.hpp" +#include "memory_wrapper.hpp" #include "apex_error_handling.hpp" #if defined(APEX_HAVE_MPI) || \ (defined(HPX_HAVE_NETWORKING) && defined(HPX_HAVE_PARCELPORT_MPI)) @@ -161,6 +162,31 @@ void _symbol( MPI_Fint *ierr ) { \ apex::sample_value(name, bytes); return bytes; } + inline double getBytesTransferred2(const int count, MPI_Datatype datatype, MPI_Comm comm, const char * function) { + int typesize = 0; + int commsize = 0; + PMPI_Type_size( datatype, &typesize ); + PMPI_Comm_size( comm, &commsize ); + double bytes = (double)(typesize) * (double)(count) * (double)commsize; + std::string name("Bytes : "); + name.append(function); + apex::sample_value(name, bytes); + return bytes; + } + inline double getBytesTransferred3(const int * count, MPI_Datatype datatype, 
MPI_Comm comm, const char * function) { + int typesize = 0; + int commsize = 0; + PMPI_Type_size( datatype, &typesize ); + PMPI_Comm_size( comm, &commsize ); + double bytes = 0; + for(int i = 0 ; i < commsize ; i++) { + bytes += ((double)(typesize) * (double)(count[i])); + } + std::string name("Bytes : "); + name.append(function); + apex::sample_value(name, bytes); + return bytes; + } inline void getBandwidth(double bytes, std::shared_ptr task, const char * function) { if ((task != nullptr) && (task->prof != nullptr)) { std::string name("BW (Bytes/second) : "); @@ -176,6 +202,7 @@ void _symbol( MPI_Fint *ierr ) { \ double bytes = getBytesTransferred(count, datatype, "MPI_Isend"); /* start the timer */ MPI_START_TIMER + apex::recordMetric("Send Bytes", bytes); /* sample the bytes */ int retval = PMPI_Isend(buf, count, datatype, dest, tag, comm, request); MPI_STOP_TIMER @@ -202,6 +229,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Irecv"); MPI_START_TIMER + apex::recordMetric("Recv Bytes", bytes); int retval = PMPI_Irecv(buf, count, datatype, source, tag, comm, request); MPI_STOP_TIMER @@ -229,6 +257,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * sou double bytes = getBytesTransferred(count, datatype, "MPI_Send"); /* start the timer */ MPI_START_TIMER + apex::recordMetric("Send Bytes", bytes); /* sample the bytes */ int retval = PMPI_Send(buf, count, datatype, dest, tag, comm); MPI_STOP_TIMER @@ -253,6 +282,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Recv"); MPI_START_TIMER + apex::recordMetric("Recv Bytes", bytes); int retval = PMPI_Recv(buf, count, datatype, source, tag, comm, status); MPI_STOP_TIMER /* record the bandwidth */ @@ -285,7 +315,12 @@ void _symbol( void * buf, MPI_Fint * count, 
MPI_Fint * datatype, MPI_Fint * sou } int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Gather sendbuf"); + double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Gather recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Gather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); @@ -311,7 +346,12 @@ void _symbol(void * sendbuf, MPI_Fint *sendcnt, MPI_Fint *sendtype, void * recvb int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(count, datatype, "MPI_Allreduce sendbuf"); + double rbytes = getBytesTransferred2(count, datatype, comm, "MPI_Allreduce recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm); MPI_STOP_TIMER @@ -335,7 +375,12 @@ void _symbol(void * sendbuf, void * recvbuf, MPI_Fint *count, MPI_Fint *datatype int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(count, datatype, "MPI_Reduce sendbuf"); + double rbytes = getBytesTransferred2(count, datatype, comm, "MPI_Reduce recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm); MPI_STOP_TIMER @@ -360,7 +405,16 @@ void 
_symbol(void * sendbuf, void * recvbuf, MPI_Fint *count, MPI_Fint *datatype int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm ) { + //int commrank; + //PMPI_Comm_rank(comm, &commrank); + /* Get the byte count */ + double sbytes = getBytesTransferred(count, datatype, "MPI_Bcast"); MPI_START_TIMER + //if (root == commrank) { + apex::recordMetric("Send Bytes", sbytes); + //} else { + //apex::recordMetric("Recv Bytes", sbytes); + //} apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Bcast(buffer, count, datatype, root, comm ); MPI_STOP_TIMER @@ -409,7 +463,12 @@ void _symbol(MPI_Fint *count, MPI_Fint * array_of_requests, MPI_Fint *ierr) { \ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Alltoall sendbuf"); + double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Alltoall recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); MPI_STOP_TIMER @@ -434,7 +493,12 @@ MPI_Fint *recvcnt, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *ierr) { \ int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Allgather sendbuf"); + double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Allgather recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, 
recvtype, comm); MPI_STOP_TIMER @@ -461,7 +525,12 @@ void _symbol(void * sendbuf, MPI_Fint *sendcount, MPI_Fint *sendtype, void * rec int MPI_Allgatherv(const void* buffer_send, int count_send, MPI_Datatype datatype_send, void* buffer_recv, const int* counts_recv, const int* displacements, MPI_Datatype datatype_recv, MPI_Comm communicator) { + /* Get the byte count */ + double sbytes = getBytesTransferred(count_send, datatype_send, "MPI_Allgatherv sendbuf"); + double rbytes = getBytesTransferred3(counts_recv, datatype_recv, communicator, "MPI_Allgatherv recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(communicator, __APEX_FUNCTION__, p); int retval = PMPI_Allgatherv(buffer_send, count_send, datatype_send, buffer_recv, counts_recv, displacements, datatype_recv, communicator); @@ -488,7 +557,12 @@ void _symbol(void * sendbuf, MPI_Fint *sendcount, MPI_Fint *sendtype, void * rec int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, MPI_Comm comm) { + /* Get the byte count */ + double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Gatherv sendbuf"); + double rbytes = getBytesTransferred3(recvcounts, recvtype, comm, "MPI_Gatherv recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm); MPI_STOP_TIMER @@ -514,7 +588,12 @@ void _symbol(void * sendbuf, MPI_Fint *sendcnt, MPI_Fint *sendtype, void * recvb int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, int recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status * status) { + /* Get the byte count */ + double sbytes = 
getBytesTransferred(sendcount, sendtype, "MPI_Sendrecv sendbuf"); + double rbytes = getBytesTransferred(recvcount, recvtype, "MPI_Sendrecv recvbuf"); MPI_START_TIMER + apex::recordMetric("Send Bytes", sbytes); + apex::recordMetric("Recv Bytes", rbytes); apex_measure_mpi_sync(comm, __APEX_FUNCTION__, p); int retval = PMPI_Sendrecv(sendbuf, sendcount, sendtype, dest, sendtag, recvbuf, recvcount, recvtype, source, recvtag, comm, status); diff --git a/src/apex/dependency_tree.cpp b/src/apex/dependency_tree.cpp index 0f058c77..fcc4ea60 100644 --- a/src/apex/dependency_tree.cpp +++ b/src/apex/dependency_tree.cpp @@ -21,6 +21,7 @@ namespace dependency { // declare an instance of the statics std::mutex Node::treeMutex; std::atomic Node::nodeCount{0}; +std::set Node::known_metrics; Node* Node::appendChild(task_identifier* c) { treeMutex.lock(); @@ -356,6 +357,14 @@ double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) double variance = std::max(0.0,((getSumSquares() / ncalls) - (mean * mean))); double stddev = sqrt(variance); outfile << stddev; + // write any available metrics + for (auto& x : known_metrics) { + if (metric_map.find(x) == metric_map.end()) { + outfile << ",0"; + } else { + outfile << "," << metric_map[x]; + } + } // end the line outfile << std::endl; @@ -377,6 +386,25 @@ double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) return acc; } +void Node::addMetrics(std::map& _metric_map) { + static std::mutex m; + for (auto& x: _metric_map) { + std::cout << x.first << " => " << x.second << '\n'; + if (known_metrics.find(x.first) == known_metrics.end()) { + m.lock(); + known_metrics.insert(x.first); + m.unlock(); + } + m.lock(); + if (metric_map.find(x.first) == metric_map.end()) { + metric_map[x.first] = x.second; + } else { + metric_map[x.first] += x.second; + } + m.unlock(); + } +} + } // dependency_tree } // apex diff --git a/src/apex/dependency_tree.hpp b/src/apex/dependency_tree.hpp index 
4ed6106e..ef9a1ee3 100644 --- a/src/apex/dependency_tree.hpp +++ b/src/apex/dependency_tree.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include "apex_types.h" #include "task_identifier.hpp" @@ -35,8 +36,11 @@ class Node { size_t index; std::set thread_ids; std::unordered_map children; + // map for arbitrary metrics + std::map metric_map; static std::mutex treeMutex; static std::atomic nodeCount; + static std::set known_metrics; public: Node(task_identifier* id, Node* p) : data(id), parent(p), count(1), inclusive(0), @@ -75,6 +79,10 @@ class Node { static size_t getNodeCount() { return nodeCount; } + void addMetrics(std::map& metric_map); + static std::set& getKnownMetrics() { + return known_metrics; + } }; } // dependency_tree diff --git a/src/apex/memory_wrapper.cpp b/src/apex/memory_wrapper.cpp index 883b5844..297bd452 100644 --- a/src/apex/memory_wrapper.cpp +++ b/src/apex/memory_wrapper.cpp @@ -162,6 +162,14 @@ void recordFree(void* ptr, bool cpu) { if (cpu) sample_value("Memory: Total Bytes Occupied", value); } +/* This doesn't belong here, but whatevs */ +void recordMetric(std::string name, double value) { + profiler * p = thread_instance::instance().get_current_profiler(); + if (p != nullptr) { + p->metric_map[name] = value; + } +} + // Comparator function to sort pairs descending, according to second value bool cmp(std::pair& a, std::pair& b) diff --git a/src/apex/memory_wrapper.hpp b/src/apex/memory_wrapper.hpp index 6e88656b..0b0a6b77 100644 --- a/src/apex/memory_wrapper.hpp +++ b/src/apex/memory_wrapper.hpp @@ -63,6 +63,7 @@ book_t& getBook(void); void printBacktrace(void); void recordAlloc(size_t bytes, void* ptr, allocator_t alloc, bool cpu = true); void recordFree(void* ptr, bool cpu = false); +void recordMetric(std::string name, double value); }; // apex namespace diff --git a/src/apex/profiler.hpp b/src/apex/profiler.hpp index d4df0e1b..536e1b68 100644 --- a/src/apex/profiler.hpp +++ b/src/apex/profiler.hpp @@ -17,6 +17,7 @@ class 
profiler; #include #include #include +#include #include "apex_options.hpp" #include "apex_types.h" #include "apex_assert.h" @@ -61,6 +62,7 @@ class profiler { bool stopped; // needed for correct Hatchet output uint64_t thread_id; + std::map metric_map; task_identifier * get_task_id(void) { return task_id; } diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index 8a6dabf4..d0605cfb 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -425,6 +425,7 @@ std::unordered_set free_profiles; } if (apex_options::use_tasktree_output() && !p.is_counter && p.tt_ptr != nullptr) { p.tt_ptr->tree_node->addAccumulated(p.elapsed_seconds(), p.inclusive_seconds(), p.is_resume, p.thread_id); + p.tt_ptr->tree_node->addMetrics(p.metric_map); } return 1; } @@ -1176,7 +1177,11 @@ std::unordered_set free_profiles; tree_stream << "\"process rank\",\"node index\",\"parent index\",\"depth\","; tree_stream << "\"name\",\"calls\",\"threads\",\"accumulated\","; tree_stream << "\"minimum\",\"mean\",\"maximum\","; - tree_stream << "\"sumsqr\"\n"; + tree_stream << "\"sumsqr\""; + for (auto& x : dependency::Node::getKnownMetrics()) { + tree_stream << ",\"" << x << "\""; + } + tree_stream << "\n"; } root->tree_node->writeNodeCSV(tree_stream, wall_clock_main, node_id); std::string filename{"apex_tasktree.csv"}; From 56eab9a093fa02b1231223f590b37ff4e65e7406 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 1 Feb 2023 06:36:37 -0800 Subject: [PATCH 10/13] Removing debug message --- src/apex/dependency_tree.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/apex/dependency_tree.cpp b/src/apex/dependency_tree.cpp index fcc4ea60..cee5abed 100644 --- a/src/apex/dependency_tree.cpp +++ b/src/apex/dependency_tree.cpp @@ -389,7 +389,6 @@ double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) void Node::addMetrics(std::map& _metric_map) { static std::mutex m; for (auto& x: _metric_map) { - std::cout << x.first << " => " 
<< x.second << '\n'; if (known_metrics.find(x.first) == known_metrics.end()) { m.lock(); known_metrics.insert(x.first); From f530fb3fb67ca2fc4c221b0cacd8da8ce81407d8 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 1 Feb 2023 12:00:48 -0800 Subject: [PATCH 11/13] Adding ability to check whether MPI can accomodate the memory requested for big transfers. Use the APEX_VALIDATE_MPI_MEMORY_USAGE variable to enable it. --- src/apex/apex_mpi.cpp | 55 ++++++++++++++++++++++++++++++++++++++ src/apex/apex_rocm_smi.cpp | 19 +++++++++++++ src/apex/apex_rocm_smi.hpp | 1 + src/apex/apex_types.h | 2 ++ src/apex/proc_read.cpp | 34 +++++++++++++++++++++++ src/apex/proc_read.h | 2 ++ 6 files changed, 113 insertions(+) diff --git a/src/apex/apex_mpi.cpp b/src/apex/apex_mpi.cpp index aefd440c..b472b3f1 100644 --- a/src/apex/apex_mpi.cpp +++ b/src/apex/apex_mpi.cpp @@ -18,6 +18,7 @@ #include "apex_api.hpp" #include "memory_wrapper.hpp" #include "apex_error_handling.hpp" +#include "proc_read.h" #if defined(APEX_HAVE_MPI) || \ (defined(HPX_HAVE_NETWORKING) && defined(HPX_HAVE_PARCELPORT_MPI)) #include "mpi.h" @@ -187,6 +188,21 @@ void _symbol( MPI_Fint *ierr ) { \ apex::sample_value(name, bytes); return bytes; } + inline bool checkAvailableMemory(double bytes_requested) { + std::array available{apex::getAvailableMemory()}; + //std::cout << "Available: " << available[0] << ", " << available[1]; + //std::cout << " Requested: " << bytes_requested << std::endl; + if (bytes_requested > + (apex::apex_options::validate_mpi_memory_usage_fraction()*available[0]) || + bytes_requested > + (apex::apex_options::validate_mpi_memory_usage_fraction()*available[1])) { + std::cerr << "Warning! Requesting too much memory!" << std::endl; + std::cerr << "Expect a failure! 
Here's a backtrace:" << std::endl; + apex_print_backtrace(); + return false; + } + return true; + } inline void getBandwidth(double bytes, std::shared_ptr task, const char * function) { if ((task != nullptr) && (task->prof != nullptr)) { std::string name("BW (Bytes/second) : "); @@ -200,6 +216,9 @@ void _symbol( MPI_Fint *ierr ) { \ int tag, MPI_Comm comm, MPI_Request *request) { /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Isend"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(bytes); + } /* start the timer */ MPI_START_TIMER apex::recordMetric("Send Bytes", bytes); @@ -229,6 +248,9 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Irecv"); MPI_START_TIMER + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(bytes); + } apex::recordMetric("Recv Bytes", bytes); int retval = PMPI_Irecv(buf, count, datatype, source, tag, comm, request); @@ -255,6 +277,9 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * sou int tag, MPI_Comm comm){ /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Send"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(bytes); + } /* start the timer */ MPI_START_TIMER apex::recordMetric("Send Bytes", bytes); @@ -281,6 +306,9 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des int source, int tag, MPI_Comm comm, MPI_Status *status){ /* Get the byte count */ double bytes = getBytesTransferred(count, datatype, "MPI_Recv"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(bytes); + } MPI_START_TIMER apex::recordMetric("Recv Bytes", bytes); int retval = PMPI_Recv(buf, count, datatype, source, tag, comm, status); @@ -318,6 +346,9 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, 
MPI_Fint * sou /* Get the byte count */ double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Gather sendbuf"); double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Gather recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -349,6 +380,9 @@ void _symbol(void * sendbuf, MPI_Fint *sendcnt, MPI_Fint *sendtype, void * recvb /* Get the byte count */ double sbytes = getBytesTransferred(count, datatype, "MPI_Allreduce sendbuf"); double rbytes = getBytesTransferred2(count, datatype, comm, "MPI_Allreduce recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -378,6 +412,9 @@ void _symbol(void * sendbuf, void * recvbuf, MPI_Fint *count, MPI_Fint *datatype /* Get the byte count */ double sbytes = getBytesTransferred(count, datatype, "MPI_Reduce sendbuf"); double rbytes = getBytesTransferred2(count, datatype, comm, "MPI_Reduce recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -409,6 +446,9 @@ void _symbol(void * sendbuf, void * recvbuf, MPI_Fint *count, MPI_Fint *datatype //PMPI_Comm_rank(comm, &commrank); /* Get the byte count */ double sbytes = getBytesTransferred(count, datatype, "MPI_Bcast"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes); + } MPI_START_TIMER //if (root == commrank) { apex::recordMetric("Send Bytes", sbytes); @@ -466,6 +506,9 @@ void _symbol(MPI_Fint *count, MPI_Fint * array_of_requests, MPI_Fint *ierr) { \ /* Get the byte count */ double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Alltoall sendbuf"); 
double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Alltoall recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -496,6 +539,9 @@ MPI_Fint *recvcnt, MPI_Fint *recvtype, MPI_Fint *comm, MPI_Fint *ierr) { \ /* Get the byte count */ double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Allgather sendbuf"); double rbytes = getBytesTransferred2(recvcount, recvtype, comm, "MPI_Allgather recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -528,6 +574,9 @@ void _symbol(void * sendbuf, MPI_Fint *sendcount, MPI_Fint *sendtype, void * rec /* Get the byte count */ double sbytes = getBytesTransferred(count_send, datatype_send, "MPI_Allgatherv sendbuf"); double rbytes = getBytesTransferred3(counts_recv, datatype_recv, communicator, "MPI_Allgatherv recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -560,6 +609,9 @@ void _symbol(void * sendbuf, MPI_Fint *sendcount, MPI_Fint *sendtype, void * rec /* Get the byte count */ double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Gatherv sendbuf"); double rbytes = getBytesTransferred3(recvcounts, recvtype, comm, "MPI_Gatherv recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); @@ -591,6 +643,9 @@ void _symbol(void * sendbuf, MPI_Fint *sendcnt, MPI_Fint *sendtype, void * recvb /* Get the byte count */ double sbytes = getBytesTransferred(sendcount, sendtype, "MPI_Sendrecv 
sendbuf"); double rbytes = getBytesTransferred(recvcount, recvtype, "MPI_Sendrecv recvbuf"); + if (apex::apex_options::validate_mpi_memory_usage()) { + checkAvailableMemory(sbytes+rbytes); + } MPI_START_TIMER apex::recordMetric("Send Bytes", sbytes); apex::recordMetric("Recv Bytes", rbytes); diff --git a/src/apex/apex_rocm_smi.cpp b/src/apex/apex_rocm_smi.cpp index 543c48d4..44d49654 100644 --- a/src/apex/apex_rocm_smi.cpp +++ b/src/apex/apex_rocm_smi.cpp @@ -353,5 +353,24 @@ void monitor::activateDeviceIndex(uint32_t index) { indexMutex.unlock(); } +double monitor::getAvailableMemory() { + double avail{0}; + indexMutex.lock(); + // use the copy constructor to get the set of active indices + std::set indexSet{activeDeviceIndices}; + indexMutex.unlock(); + /* just check the first known device for now, assume 1 */ + for (uint32_t d : indexSet) { + uint64_t memory_total; + RSMI_CALL(rsmi_dev_memory_total_get(d, RSMI_MEM_TYPE_VRAM, &memory_total)); + uint64_t memory_usage; + RSMI_CALL(rsmi_dev_memory_usage_get(d, RSMI_MEM_TYPE_VRAM, &memory_usage)); + avail = (double)(memory_total - memory_usage); + break; + } + + return avail; +} + } // namespace rsmi } // namespace apex diff --git a/src/apex/apex_rocm_smi.hpp b/src/apex/apex_rocm_smi.hpp index 50a1e6ce..eac79f74 100644 --- a/src/apex/apex_rocm_smi.hpp +++ b/src/apex/apex_rocm_smi.hpp @@ -62,6 +62,7 @@ class monitor { void query(); void stop(); void activateDeviceIndex(uint32_t index); + double getAvailableMemory(); private: bool success; uint32_t deviceCount; diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index 07a07008..edd5fa09 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -355,9 +355,11 @@ inline unsigned int sc_nprocessors_onln() macro (APEX_START_DELAY_SECONDS, start_delay_seconds, int, 0, "Delay collection of APEX data for N seconds.") \ macro (APEX_MAX_DURATION_SECONDS, max_duration_seconds, int, 0, "Collect APEX data for only N seconds.") \ macro (APEX_USE_SHORT_TASK_NAMES, 
use_short_task_names, bool, false, "") \ + macro (APEX_VALIDATE_MPI_MEMORY_USAGE, validate_mpi_memory_usage, bool, false, "") \ #define FOREACH_APEX_FLOAT_OPTION(macro) \ macro (APEX_SCATTERPLOT_FRACTION, scatterplot_fraction, double, 0.01, "Fraction of kernel executions to include on scatterplot.") \ + macro (APEX_VALIDATE_MPI_MEMORY_USAGE_FRACTION, validate_mpi_memory_usage_fraction, double, 1.0, "") \ #define FOREACH_APEX_STRING_OPTION(macro) \ macro (APEX_PAPI_METRICS, papi_metrics, char*, "", "PAPI metrics requested, separated by spaces.") \ diff --git a/src/apex/proc_read.cpp b/src/apex/proc_read.cpp index 1834f2d3..d9e64042 100644 --- a/src/apex/proc_read.cpp +++ b/src/apex/proc_read.cpp @@ -66,6 +66,10 @@ using namespace std; namespace apex { +#ifdef APEX_WITH_HIP +rsmi::monitor * global_rsmi_reader = nullptr; +#endif + void get_popen_data(char *cmnd) { FILE *pf; string command; @@ -834,6 +838,7 @@ namespace apex { // If PAPI support is lacking, use our own support if (apex_options::monitor_gpu()) { rsmi_reader = new rsmi::monitor(); + global_rsmi_reader = rsmi_reader; rsmi_reader->query(); } rocprofiler::monitor * rocprof_reader; @@ -992,6 +997,35 @@ namespace apex { return line; } +std::array getAvailableMemory() { + std::array values{0,0}; + /* Get the CPU memory */ + FILE *f = fopen("/proc/meminfo", "r"); + if (f) { + char line[4096] = {0}; + while ( fgets( line, 4096, f)) { + string tmp(line); + const REGEX_NAMESPACE::regex separator(":"); + REGEX_NAMESPACE::sregex_token_iterator token(tmp.begin(), tmp.end(), + separator, -1); + REGEX_NAMESPACE::sregex_token_iterator end; + string name = *token++; + if (token != end && name.find("MemFree") != name.npos) { + string value = *token; + char* pEnd; + double d1 = strtod (value.c_str(), &pEnd); + if (pEnd) { values[0] = d1; } + break; + } + } + fclose(f); + } + if (global_rsmi_reader != nullptr) { + values[1] = global_rsmi_reader->getAvailableMemory(); + } + return values; +} + } // namespace #endif // 
APEX_HAVE_PROC diff --git a/src/apex/proc_read.h b/src/apex/proc_read.h index 7185095f..2ffd65ac 100644 --- a/src/apex/proc_read.h +++ b/src/apex/proc_read.h @@ -236,6 +236,8 @@ void apex_finalize_msr(void); double msr_current_power_high(void); #endif +std::array getAvailableMemory(); + } #endif From b23673cc7a13b785d65476f8a5a7f3c6205afbe1 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Fri, 3 Feb 2023 15:52:57 -0800 Subject: [PATCH 12/13] More advanced statistics for MPI bytes in the tasktree data, including min, max, mode, median, stddev --- src/apex/dependency_tree.cpp | 48 +++++++++++++++++++++++++++++----- src/apex/dependency_tree.hpp | 26 +++++++++++++++++- src/apex/profiler_listener.cpp | 14 +++++++--- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/src/apex/dependency_tree.cpp b/src/apex/dependency_tree.cpp index cee5abed..427148b1 100644 --- a/src/apex/dependency_tree.cpp +++ b/src/apex/dependency_tree.cpp @@ -360,9 +360,39 @@ double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) // write any available metrics for (auto& x : known_metrics) { if (metric_map.find(x) == metric_map.end()) { - outfile << ",0"; + outfile << ",,,,,,,"; } else { - outfile << "," << metric_map[x]; + const auto& value = metric_map.find(x); + const auto& p = value->second.prof; + outfile << "," << p.accumulated; + outfile << "," << p.minimum; + double mean = p.accumulated/ncalls; + outfile << "," << mean; + outfile << "," << p.maximum; + // compute the standard deviation + double t1 = p.sum_squares / ncalls; + double t2 = mean * mean; + double t3 = t1 - t2; + variance = std::max(0.0,(t3)); + stddev = sqrt(variance); + outfile << "," << stddev; + // find the median + auto& d = value->second.distribution; + // how many do we have? 
+ size_t total = 0; + double mode = 0; + double median = 0; + size_t half = (size_t)(ncalls/2.0); + size_t most = 0; + for (auto& node : d) { + total += node.second; + if (total >= half) { median = node.first; break; } + } + for (auto& node : d) { + if (node.second > most) { mode = node.first; } + } + outfile << "," << median; + outfile << "," << mode; } } // end the line @@ -389,16 +419,20 @@ double Node::writeNodeCSV(std::stringstream& outfile, double total, int node_id) void Node::addMetrics(std::map& _metric_map) { static std::mutex m; for (auto& x: _metric_map) { - if (known_metrics.find(x.first) == known_metrics.end()) { + std::string name{x.first}; + double value{x.second}; + if (known_metrics.find(name) == known_metrics.end()) { m.lock(); - known_metrics.insert(x.first); + known_metrics.insert(name); m.unlock(); } m.lock(); - if (metric_map.find(x.first) == metric_map.end()) { - metric_map[x.first] = x.second; + if (metric_map.find(name) == metric_map.end()) { + metricStorage newval(value); + metric_map.emplace(name, std::move(newval)); } else { - metric_map[x.first] += x.second; + auto element = metric_map.find(name); + element->second.increment(value); } m.unlock(); } diff --git a/src/apex/dependency_tree.hpp b/src/apex/dependency_tree.hpp index ef9a1ee3..86e8e75d 100644 --- a/src/apex/dependency_tree.hpp +++ b/src/apex/dependency_tree.hpp @@ -21,6 +21,30 @@ namespace apex { namespace dependency { +class metricStorage { +public: + apex_profile prof; + std::map distribution; + metricStorage(double value) { + prof.accumulated = value; + prof.maximum = value; + prof.minimum = value; + prof.sum_squares = value*value; + distribution[value] = 1; + } + void increment(double value) { + prof.accumulated += value; + prof.maximum = std::max(prof.maximum, value); + prof.minimum = std::min(prof.minimum, value); + prof.sum_squares += value*value; + if (distribution.find(value) == distribution.end()) { + distribution[value] = 1; + } else { + distribution[value] += 1; + 
} + } +}; + class Node { private: task_identifier* data; @@ -37,7 +61,7 @@ class Node { std::set thread_ids; std::unordered_map children; // map for arbitrary metrics - std::map metric_map; + std::map metric_map; static std::mutex treeMutex; static std::atomic nodeCount; static std::set known_metrics; diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index d0605cfb..6952cf9c 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -1175,11 +1175,17 @@ std::unordered_set free_profiles; stringstream tree_stream; if (node_id == 0) { tree_stream << "\"process rank\",\"node index\",\"parent index\",\"depth\","; - tree_stream << "\"name\",\"calls\",\"threads\",\"accumulated\","; - tree_stream << "\"minimum\",\"mean\",\"maximum\","; - tree_stream << "\"sumsqr\""; + tree_stream << "\"name\",\"calls\",\"threads\",\"total time(ns)\","; + tree_stream << "\"minimum time(ns)\",\"mean time(ns)\",\"maximum time(ns)\","; + tree_stream << "\"stddev time(ns)\""; for (auto& x : dependency::Node::getKnownMetrics()) { - tree_stream << ",\"" << x << "\""; + tree_stream << ",\"total " << x << "\""; + tree_stream << ",\"minimum " << x << "\""; + tree_stream << ",\"mean " << x << "\""; + tree_stream << ",\"maximum " << x << "\""; + tree_stream << ",\"stddev " << x << "\""; + tree_stream << ",\"median " << x << "\""; + tree_stream << ",\"mode " << x << "\""; } tree_stream << "\n"; } From bc2444932abb5b35af6ab079c742cf18dfd7fba8 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Fri, 3 Feb 2023 16:07:52 -0800 Subject: [PATCH 13/13] Removing BW computation during run, what's the point of adding overhead? 
--- src/apex/apex_mpi.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apex/apex_mpi.cpp b/src/apex/apex_mpi.cpp index b472b3f1..d5dbc92c 100644 --- a/src/apex/apex_mpi.cpp +++ b/src/apex/apex_mpi.cpp @@ -226,7 +226,7 @@ void _symbol( MPI_Fint *ierr ) { \ int retval = PMPI_Isend(buf, count, datatype, dest, tag, comm, request); MPI_STOP_TIMER /* record the bandwidth */ - getBandwidth(bytes, p, "MPI_Isend"); + //getBandwidth(bytes, p, "MPI_Isend"); return retval; } #define APEX_MPI_ISEND_TEMPLATE(_symbol) \ @@ -256,7 +256,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des request); MPI_STOP_TIMER /* record the bandwidth */ - getBandwidth(bytes, p, "MPI_Irecv"); + //getBandwidth(bytes, p, "MPI_Irecv"); return retval; } #define APEX_MPI_IRECV_TEMPLATE(_symbol) \ @@ -287,7 +287,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * sou int retval = PMPI_Send(buf, count, datatype, dest, tag, comm); MPI_STOP_TIMER /* record the bandwidth */ - getBandwidth(bytes, p, "MPI_Send"); + //getBandwidth(bytes, p, "MPI_Send"); return retval; } #define APEX_MPI_SEND_TEMPLATE(_symbol) \ @@ -314,7 +314,7 @@ void _symbol( void * buf, MPI_Fint * count, MPI_Fint * datatype, MPI_Fint * des int retval = PMPI_Recv(buf, count, datatype, source, tag, comm, status); MPI_STOP_TIMER /* record the bandwidth */ - getBandwidth(bytes, p, "MPI_Recv"); + //getBandwidth(bytes, p, "MPI_Recv"); return retval; } #define APEX_MPI_RECV_TEMPLATE(_symbol) \