diff --git a/.gitmodules b/.gitmodules index 0b1ce3d0..2aecf69e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "external/timemory"] path = external/timemory - url = https://github.com/NERSC/timemory.git + url = https://github.com/ROCm/timemory.git [submodule "external/perfetto"] path = external/perfetto url = https://github.com/google/perfetto.git diff --git a/README.md b/README.md index 3aa7f274..e9c2ec47 100755 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ export LD_LIBRARY_PATH=/opt/rocprofiler-systems/lib:${LD_LIBRARY_PATH} Generate a rocprofiler-systems configuration file using `rocprof-sys-avail -G rocprof-sys.cfg`. Optionally, use `rocprof-sys-avail -G rocprof-sys.cfg --all` for a verbose configuration file with descriptions, categories, etc. Modify the configuration file as desired, e.g. enable -[perfetto](https://perfetto.dev/), [timemory](https://github.com/NERSC/timemory), sampling, and process-level sampling by default +[perfetto](https://perfetto.dev/), [timemory](https://github.com/ROCm/timemory), sampling, and process-level sampling by default and tweak some sampling default values: ```console diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index f3383505..eadbfe03 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -677,7 +677,7 @@ mark_as_advanced(TIMEMORY_PROJECT_NAME) rocprofiler_systems_checkout_git_submodule( RELATIVE_PATH external/timemory WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - REPO_URL https://github.com/NERSC/timemory.git + REPO_URL https://github.com/ROCm/timemory.git REPO_BRANCH omnitrace) rocprofiler_systems_save_variables( diff --git a/docs/conceptual/rocprof-sys-feature-set.rst b/docs/conceptual/rocprof-sys-feature-set.rst index b0502ea6..b26e8f13 100644 --- a/docs/conceptual/rocprof-sys-feature-set.rst +++ b/docs/conceptual/rocprof-sys-feature-set.rst @@ -7,7 +7,7 @@ The ROCm Systems Profiler feature set and use cases *************************************** `ROCm Systems Profiler `_ is designed to be highly extensible. -Internally, it leverages the `Timemory performance analysis toolkit `_ +Internally, it leverages the `Timemory performance analysis toolkit `_ to manage extensions, resources, data, and other items. It supports the following features, modes, metrics, and APIs. diff --git a/docs/how-to/configuring-runtime-options.rst b/docs/how-to/configuring-runtime-options.rst index 99467b13..bc816883 100644 --- a/docs/how-to/configuring-runtime-options.rst +++ b/docs/how-to/configuring-runtime-options.rst @@ -28,7 +28,7 @@ use the ``rocprof-sys-avail -G ~/.rocprof-sys.cfg --all`` option for a verbose configuration file with descriptions, categories, and additional information. Modify ``${HOME}/.rocprof-sys.cfg`` as required. For example, enable `Perfetto `_, -`Timemory `_, sampling, and process-level sampling by default +`Timemory `_, sampling, and process-level sampling by default and tweak the default sampling values. .. code-block:: shell @@ -64,7 +64,7 @@ accepts a case insensitive match for nearly all common Boolean logic expressions Exploring components ----------------------------------- -ROCm Systems Profiler uses `Timemory `_ extensively to provide +ROCm Systems Profiler uses `Timemory `_ extensively to provide various capabilities and manage data and resources. By default, with ``ROCPROFSYS_PROFILE=ON``, ROCm Systems Profiler only collects wall-clock timing values. However, by modifying the ``ROCPROFSYS_TIMEMORY_COMPONENTS`` setting, diff --git a/examples/openmp/CMakeLists.txt b/examples/openmp/CMakeLists.txt index a550dd17..496c7d64 100644 --- a/examples/openmp/CMakeLists.txt +++ b/examples/openmp/CMakeLists.txt @@ -56,3 +56,5 @@ if(ROCPROFSYS_INSTALL_EXAMPLES) DESTINATION bin COMPONENT rocprofiler-systems-examples) endif() + +add_subdirectory(target) diff --git a/examples/openmp/target/CMakeLists.txt b/examples/openmp/target/CMakeLists.txt new file mode 100644 index 00000000..6ed9c5a0 --- /dev/null +++ b/examples/openmp/target/CMakeLists.txt @@ -0,0 +1,108 @@ +# +# +# +cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR) + +# try to find a compatible HIP version +if(ROCmVersion_MAJOR_VERSION AND ROCmVersion_MAJOR_VERSION GREATER_EQUAL 6) + find_package(hip ${ROCmVersion_MAJOR_VERSION}.0.0) +else() + find_package(hip) +endif() + +if(NOT hip_FOUND) + message(WARNING "ROCm >= 5.6 not found. Skipping OpenMP target example.") + return() +elseif(hip_FOUND AND hip_VERSION VERSION_LESS 5.6.0) + message( + WARNING + "ROCm >= 5.6 not found (found ${hip_VERSION}). Skipping OpenMP target example." + ) + return() +endif() + +if(NOT OMP_TARGET_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(OMP_TARGET_COMPILER + "${amdclangpp_EXECUTABLE}" + CACHE FILEPATH "OpenMP target compiler") + else() + message(WARNING "OpenMP target compiler not found. Skipping this example.") + return() + endif() +endif() + +project(rocprofiler-systems-example-openmp-target-lib LANGUAGES CXX) + +set(CMAKE_BUILD_TYPE "RelWithDebInfo") + +set(DEFAULT_GPU_TARGETS + "gfx900" + "gfx906" + "gfx908" + "gfx90a" + "gfx940" + "gfx941" + "gfx942" + "gfx1030" + "gfx1010" + "gfx1100" + "gfx1101" + "gfx1102") + +set(GPU_TARGETS + "${DEFAULT_GPU_TARGETS}" + CACHE STRING "GPU targets to compile for") + +find_package(Threads REQUIRED) + +add_library(openmp-target-lib SHARED) +target_sources(openmp-target-lib PRIVATE library.cpp) +target_link_libraries(openmp-target-lib PUBLIC Threads::Threads) +target_compile_options(openmp-target-lib PRIVATE -fopenmp -ggdb) +target_link_options(openmp-target-lib PUBLIC -fopenmp) + +foreach(_TARGET ${GPU_TARGETS}) + target_compile_options(openmp-target-lib PRIVATE --offload-arch=${_TARGET}) + target_link_options(openmp-target-lib PUBLIC --offload-arch=${_TARGET}) +endforeach() + +message(STATUS "Using OpenMP target compiler: ${OMP_TARGET_COMPILER}") + +get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER} PATH) +get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER_DIR} PATH) + +message(STATUS "Using OpemMP target compiler directory: ${OMP_TARGET_COMPILER_DIR}") + +if(NOT EXISTS ${OMP_TARGET_COMPILER_DIR}/llvm/lib) + message(FATAL_ERROR "${OMP_TARGET_COMPILER_DIR}/llvm/lib does not exist") +endif() +set_target_properties( + openmp-target-lib + PROPERTIES BUILD_RPATH + "${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib" + OUTPUT_NAME "openmp-target" + POSITION_INDEPENDENT_CODE ON) + +rocprofiler_systems_custom_compilation(TARGET openmp-target-lib COMPILER ${OMP_TARGET_COMPILER}) + +add_executable(openmp-target) +target_sources(openmp-target PRIVATE main.cpp) +target_link_libraries(openmp-target PRIVATE openmp-target-lib) +target_compile_options(openmp-target PRIVATE -ggdb) + +set_target_properties( + openmp-target + PROPERTIES BUILD_RPATH + "${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib" + POSITION_INDEPENDENT_CODE ON) + +rocprofiler_systems_custom_compilation(TARGET openmp-target COMPILER ${OMP_TARGET_COMPILER}) diff --git a/examples/openmp/target/library.cpp b/examples/openmp/target/library.cpp new file mode 100644 index 00000000..580721df --- /dev/null +++ b/examples/openmp/target/library.cpp @@ -0,0 +1,149 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr float EPS_FLOAT = 1.0e-7f; +constexpr double EPS_DOUBLE = 1.0e-15; + +#pragma omp declare target +template +T +mul(T a, T b) +{ + T c; + c = a * b; + return c; +} +#pragma omp end declare target + +template +void +vmul(T* a, T* b, T* c, int N) +{ +#pragma omp target map(to : a [0:N], b [0:N]) map(from : c [0:N]) +#pragma omp teams distribute parallel for + for(int i = 0; i < N; i++) + { + c[i] = mul(a[i], b[i]); + } +} + +int +run_impl() +{ + std::this_thread::sleep_for(std::chrono::milliseconds{ 50 }); + + constexpr int N = 100000; + constexpr int Nc = N / 100; + int a_i[N], b_i[N], c_i[N], validate_i[N]; + float a_f[N], b_f[N], c_f[N], validate_f[N]; + double a_d[N], b_d[N], c_d[N], validate_d[N]; + int N_errors = 0; + bool flag = false; + +#pragma omp parallel for schedule(dynamic, Nc) + for(int i = 0; i < N; ++i) + { + a_f[i] = a_i[i] = i + 1; + b_f[i] = b_i[i] = i + 2; + a_d[i] = a_i[i]; + b_d[i] = b_i[i]; + validate_i[i] = a_i[i] * b_i[i]; + validate_f[i] = a_f[i] * b_f[i]; + validate_d[i] = a_d[i] * b_d[i]; + } + + for(int i = 0; i < 2; ++i) + { + vmul(a_i, b_i, c_i, N); + vmul(a_f, b_f, c_f, N); + vmul(a_d, b_d, c_d, N); + } + + for(int i = 0; i < N; i++) + { + if(c_i[i] != validate_i[i]) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf("First fail: c_i[%d](%d) != validate_i[%d](%d)\n", i, c_i[i], i, + validate_i[i]); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_f[i] - validate_f[i]) > EPS_FLOAT) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf("First fail: c_f[%d](%f) != validate_f[%d](%f)\n", i, + static_cast(c_f[i]), i, + static_cast(validate_f[i])); + flag = true; + } + } + } + flag = false; + for(int i = 0; i < N; i++) + { + if(fabs(c_d[i] - validate_d[i]) > EPS_DOUBLE) + { + ++N_errors; + // print 1st bad index + if(!flag) + { + printf("First fail: c_d[%d](%f) != validate_d[%d](%f)\n", i, c_d[i], i, + validate_d[i]); + flag = true; + } + } + } + + return N_errors; +} + +int +run() +{ +#pragma omp parallel + { + run_impl(); + } + + return 0; +} diff --git a/examples/openmp/target/main.cpp b/examples/openmp/target/main.cpp new file mode 100644 index 00000000..081c03b9 --- /dev/null +++ b/examples/openmp/target/main.cpp @@ -0,0 +1,52 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include + +extern int +run(); + +int +main() +{ + auto N_errors = run(); + auto _ec = EXIT_SUCCESS; + if(N_errors == 0) + { + printf("Success\n"); + } + else + { + printf("Total %d failures\n", N_errors); + printf("Fail\n"); + _ec = EXIT_FAILURE; + } + + return _ec; +} diff --git a/external/timemory b/external/timemory index 9ce43f32..68ce4200 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 9ce43f32939dd71aa8905f5c72537e9a3ffcc7c2 +Subproject commit 68ce420086bfd73cbf0986c5ad10d811c8934f78 diff --git a/source/lib/rocprof-sys/library/ompt.cpp b/source/lib/rocprof-sys/library/ompt.cpp index 6cb66c38..f2396158 100644 --- a/source/lib/rocprof-sys/library/ompt.cpp +++ b/source/lib/rocprof-sys/library/ompt.cpp @@ -30,27 +30,171 @@ #if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0 +# include "binary/link_map.hpp" # include "core/components/fwd.hpp" # include "library/components/category_region.hpp" +# include "library/tracing.hpp" # include +# include +# include +# include # include +# include # include # include +# include +# include +# include +# include +# include +# include # include +# include +# include -using api_t = TIMEMORY_API; -using ompt_handle_t = tim::component::ompt_handle; -using ompt_context_t = tim::openmp::context_handler; -using ompt_toolset_t = typename ompt_handle_t::toolset_type; -using ompt_bundle_t = tim::component_tuple; +using api_t = tim::project::rocprofsys; -extern "C" +namespace rocprofsys { - ompt_start_tool_result_t* ompt_start_tool(unsigned int, - const char*) ROCPROFSYS_PUBLIC_API; -} +namespace component +{ +struct ompt : comp::base +{ + using value_type = void; + using base_type = comp::base; + using context_info_t = tim::openmp::context_info; + + static std::string label() { return "ompt"; } + static std::string description() { return "OpenMP tools tracing"; } + + ompt() = default; + ~ompt() = default; + ompt(const ompt&) = default; + ompt(ompt&&) noexcept = default; + + ompt& operator=(const ompt&) = default; + ompt& operator=(ompt&&) noexcept = default; + + template + void start(const context_info_t& _ctx_info, Args&&...) const + { + category_region::start(m_prefix); + + auto _ts = tracing::now(); + uint64_t _cid = + (_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0; + auto _annotate = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _ts); + for(const auto& itr : _ctx_info.arguments) + tracing::add_perfetto_annotation(ctx, itr.label, itr.value); + } + }; + + if(_cid > 0) + { + category_region::start( + (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts, + ::perfetto::Flow::ProcessScoped(_cid), std::move(_annotate)); + } + else + { + category_region::start( + (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts, + std::move(_annotate)); + } + } + + template + void stop(const context_info_t& _ctx_info, Args&&...) const + { + category_region::stop(m_prefix); + + auto _ts = tracing::now(); + uint64_t _cid = + (_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0; + auto _annotate = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "end_ns", _ts); + for(const auto& itr : _ctx_info.arguments) + tracing::add_perfetto_annotation(ctx, itr.label, itr.value); + } + }; + + if(_cid > 0) + { + category_region::stop( + (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts, + std::move(_annotate)); + } + else + { + category_region::stop( + (_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts, + std::move(_annotate)); + } + } + + template + void store(const context_info_t& _ctx_info, Args&&... _args) const + { + start(_ctx_info, std::forward(_args)...); + stop(_ctx_info, std::forward(_args)...); + } + + static void record(std::string_view name, ompt_id_t id, uint64_t beg_time, + uint64_t end_time, uint64_t thrd_id, uint64_t targ_id, + const context_info_t& common) + { + (void) thrd_id; + (void) targ_id; + + auto _annotate = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + for(const auto& itr : common.arguments) + tracing::add_perfetto_annotation(ctx, itr.label, itr.value); + } + }; + + auto _track = tracing::get_perfetto_track( + category::ompt{}, + [](uint64_t _targ_id_v) { + return ::timemory::join::join("", "OMP Target ", _targ_id_v); + }, + targ_id); + + category_region::start( + name, _track, beg_time, ::perfetto::Flow::ProcessScoped(id), + std::move(_annotate)); + + category_region::stop(name, _track, + end_time); + } + + void set_prefix(std::string_view _v) { m_prefix = _v; } + +private: + std::string_view m_prefix = {}; +}; +} // namespace component +} // namespace rocprofsys + +namespace tim +{ +namespace trait +{ +template <> +struct ompt_handle +{ + using type = component_tuple<::rocprofsys::component::ompt>; +}; +} // namespace trait +} // namespace tim namespace rocprofsys { @@ -58,6 +202,11 @@ namespace ompt { namespace { +using ompt_handle_t = tim::component::ompt_handle; +using ompt_context_t = tim::openmp::context_handler; +using ompt_toolset_t = typename ompt_handle_t::toolset_type; +using ompt_bundle_t = tim::component_tuple; + std::unique_ptr f_bundle = {}; bool _init_toolset_off = (trait::runtime_enabled::set(false), trait::runtime_enabled::set(false), true); @@ -70,10 +219,7 @@ setup() if(!tim::settings::enabled()) return; trait::runtime_enabled::set(true); trait::runtime_enabled::set(true); - comp::user_ompt_bundle::global_init(); - comp::user_ompt_bundle::reset(); tim::auto_lock_t lk{ tim::type_mutex() }; - comp::user_ompt_bundle::configure>(); f_bundle = std::make_unique("rocprofsys/ompt", quirk::config{}); } @@ -91,10 +237,15 @@ shutdown() ompt_context_t::cleanup(); trait::runtime_enabled::set(false); trait::runtime_enabled::set(false); - comp::user_ompt_bundle::reset(); pthread_gotcha::shutdown(); // call the OMPT finalize callback - if(f_finalize) (*f_finalize)(); + if(f_finalize) + { + for(const auto& itr : tim::openmp::get_ompt_device_functions()) + if(itr.second.stop_trace) itr.second.stop_trace(itr.second.device); + (*f_finalize)(); + f_finalize = nullptr; + } } f_bundle.reset(); _protect = false; @@ -115,21 +266,231 @@ tool_initialize(ompt_function_lookup_t lookup, int initial_device_num, { if(!rocprofsys::settings_are_configured()) { - ROCPROFSYS_BASIC_WARNING( + ROCPROFSYS_BASIC_WARNING_F( 0, "[%s] invoked before rocprof-sys was initialized. In instrumentation mode, " "settings exported to the environment have not been propagated yet...\n", __FUNCTION__); - rocprofsys::configure_settings(); + use_tool() = get_env("ROCPROFSYS_USE_OMPT", true, false); + } + else + { + use_tool() = rocprofsys::config::get_use_ompt(); } - use_tool() = rocprofsys::config::get_use_ompt(); if(use_tool()) { - TIMEMORY_PRINTF(stderr, "OpenMP-tools configuring for initial device %i\n\n", - initial_device_num); - f_finalize = tim::ompt::configure( - lookup, initial_device_num, tool_data); + ROCPROFSYS_BASIC_VERBOSE_F(2, "OpenMP-tools configuring for initial device %i\n\n", + initial_device_num); + + static auto _generate_key = [](std::string_view _key_v, + const ::tim::openmp::argument_array_t& _args_v) { + return std::string{ _key_v }; + (void) _args_v; + }; + + tim::openmp::get_codeptr_ra_resolver() = + [](tim::openmp::context_info& _ctx_info) { + const auto& _key = _ctx_info.label; + const auto* codeptr_ra = _ctx_info.codeptr_ra; + auto& _args = _ctx_info.arguments; + + ROCPROFSYS_BASIC_VERBOSE(2, "resolving codeptr return address for %s\n", + _key.data()); + + if(!codeptr_ra) return _generate_key(_key, _args); + + static thread_local auto _once = std::once_flag{}; + std::call_once(_once, []() { ::tim::unwind::update_file_maps(); }); + + auto _info = ::rocprofsys::binary::lookup_ipaddr_entry( + reinterpret_cast(codeptr_ra)); + + if(_info) + { + _ctx_info.func = tim::demangle(_info->name); + if(_info->lineno > 0) + { + auto _linfo = _info->lineinfo.rget([](const auto& _v) -> bool { + return (_v && !_v.location.empty() && _v.line > 0); + }); + + if(_linfo) + { + _ctx_info.file = _linfo.location; + _ctx_info.line = _linfo.line; + _args.emplace_back("file", _ctx_info.file); + _args.emplace_back("lineinfo", + ::timemory::join::join("@", _ctx_info.file, + _ctx_info.line)); + } + else + { + _ctx_info.file = _info->location; + _args.emplace_back("file", _ctx_info.file); + } + + return _generate_key( + ::timemory::join::join(" @ ", _key, _ctx_info.func), _args); + } + else + { + return _generate_key( + ::timemory::join::join(" @ ", _key, _ctx_info.func), _args); + } + } + else + { + auto _dl_info = Dl_info{ nullptr, nullptr, nullptr, nullptr }; + if(dladdr(codeptr_ra, &_dl_info) != 0) + { + _ctx_info.file = _dl_info.dli_fname; + _ctx_info.func = tim::demangle(_dl_info.dli_sname); + _args.emplace_back("file", _ctx_info.file); + return _generate_key( + ::timemory::join::join( + " @ ", _key, + ::timemory::join::join("", _ctx_info.func, " [", + _ctx_info.file, "]")), + _args); + } + } + + // since no line info could be deduced, include the codeptr return address + auto _args_codeptr_v = _args; + _args_codeptr_v.emplace_back("codeptr_ra", codeptr_ra); + return _generate_key(_key, _args_codeptr_v); + }; + + tim::openmp::get_function_lookup_callback< + api_t>() = [](ompt_function_lookup_t, + const std::optional& + params) { + if(!params) return; + + ROCPROFSYS_VERBOSE(3, "[ompt] configuring device %i...\n", params->device_num); + + auto& device_funcs = + tim::openmp::get_ompt_device_functions().at(params->device_num); + + device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_data_op); + device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_submit); + + static ompt_callback_buffer_request_t request = + [](int device_num, ompt_buffer_t** buffer, size_t* bytes) { + ROCPROFSYS_VERBOSE(3, "[ompt] buffer request...\n"); + *bytes = ::tim::units::get_page_size(); + *buffer = mmap(nullptr, *bytes, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + (void) device_num; + }; + + static ompt_callback_buffer_complete_t complete = [](int device_num, + ompt_buffer_t* buffer, + size_t bytes, + ompt_buffer_cursor_t + begin, + int buffer_owned) { + ROCPROFSYS_VERBOSE(3, "[ompt] buffer complete...\n"); + tim::consume_parameters(device_num, buffer, bytes, begin, buffer_owned); + + auto _funcs = + tim::openmp::get_ompt_device_functions().at(device_num); + auto _skew = rocprofsys::tracing::get_clock_skew( + [&_funcs]() { return _funcs.get_device_time(_funcs.device); }); + + ompt_buffer_cursor_t _cursor = begin; + size_t _nrecords = 0; + do + { + if(_cursor == 0) break; + ++_nrecords; + auto* _record = _funcs.get_record_ompt(buffer, _cursor); + if(_record) + { + const char* _type = tim::openmp::get_enum_label(_record->type); + auto _thrd_id = _record->thread_id; + auto _targ_id = _record->target_id; + + unsigned long beg_time = _record->time + _skew; + unsigned long end_time = 0; + ompt_id_t id = 0; + const char* _name = tim::openmp::get_enum_label(_record->type); + + if(_record->type == ompt_callback_target_submit) + { + auto& _data = _record->record.target_kernel; + end_time = _data.end_time + _skew; + id = _data.host_op_id; + + auto _ctx_info = tim::openmp::argument_array_t{ + { "begin_ns", beg_time }, + { "end_ns", end_time }, + { "type", _type }, + { "thread_id", _thrd_id }, + { "target_id", _targ_id }, + { "host_op_id", id }, + { "requested_num_teams", _data.requested_num_teams }, + { "granted_num_teams", _data.granted_num_teams } + }; + + component::ompt::record( + _name, id, beg_time, end_time, _thrd_id, _targ_id, + tim::openmp::context_info{ _name, nullptr, _ctx_info }); + } + else if(_record->type == ompt_callback_target_data_op) + { + auto& _data = _record->record.target_data_op; + end_time = _data.end_time + _skew; + id = _data.host_op_id; + const auto* _opname = + tim::openmp::get_enum_label(_data.optype); + + auto _ctx_info = tim::openmp::argument_array_t{ + { "begin_ns", beg_time }, + { "end_ns", end_time }, + { "type", _type }, + { "thread_id", _thrd_id }, + { "target_id", _targ_id }, + { "host_op_id", id }, + { "optype", _opname }, + { "src_addr", reinterpret_cast(_data.src_addr) }, + { "dst_addr", reinterpret_cast(_data.dest_addr) }, + { "src_device_num", _data.src_device_num }, + { "dst_device_num", _data.dest_device_num }, + { "bytes", _data.bytes }, + }; + + component::ompt::record( + _opname, id, beg_time, end_time, _thrd_id, _targ_id, + tim::openmp::context_info{ _name, nullptr, _ctx_info }); + } + + ROCPROFSYS_VERBOSE( + 3, + "type=%i, type_name=%s, start=%lu, end=%lu, delta=%lu, " + "tid=%lu, target_id=%lu, host_id=%lu\n", + _record->type, tim::openmp::get_enum_label(_record->type), + beg_time, end_time, (end_time - beg_time), _record->thread_id, + _record->target_id, id); + } + + _funcs.advance_buffer_cursor(_funcs.device, buffer, bytes, _cursor, + &_cursor); + } while(_cursor != 0); + + ROCPROFSYS_VERBOSE(3, "[ompt] number of records: %zu\n", _nrecords); + + if(buffer_owned == 1) + { + ::munmap(buffer, bytes); + } + }; + + device_funcs.start_trace(params->device, request, complete); + }; + + f_finalize = tim::ompt::configure(lookup, initial_device_num, tool_data); } return 1; // success } @@ -143,18 +504,23 @@ tool_finalize(ompt_data_t*) } // namespace ompt } // namespace rocprofsys -extern "C" ompt_start_tool_result_t* -ompt_start_tool(unsigned int omp_version, const char* runtime_version) +extern "C" { - ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n", - omp_version, runtime_version); - ROCPROFSYS_METADATA("OMP_VERSION", omp_version); - ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version); + ompt_start_tool_result_t* ompt_start_tool(unsigned int, + const char*) ROCPROFSYS_PUBLIC_API; - static auto* data = new ompt_start_tool_result_t{ &rocprofsys::ompt::tool_initialize, - &rocprofsys::ompt::tool_finalize, - { 0 } }; - return data; + ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version, + const char* runtime_version) + { + ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n", + omp_version, runtime_version); + ROCPROFSYS_METADATA("OMP_VERSION", omp_version); + ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version); + static auto* data = new ompt_start_tool_result_t{ + &rocprofsys::ompt::tool_initialize, &rocprofsys::ompt::tool_finalize, { 0 } + }; + return data; + } } #else diff --git a/source/lib/rocprof-sys/library/roctracer.cpp b/source/lib/rocprof-sys/library/roctracer.cpp index daf6bb8a..b7abcf71 100644 --- a/source/lib/rocprof-sys/library/roctracer.cpp +++ b/source/lib/rocprof-sys/library/roctracer.cpp @@ -163,55 +163,18 @@ get_clock_skew() static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true); if(!_use) return 0; static auto _v = []() { - namespace cpu = tim::cpu; - // synchronize timestamps - // We'll take a CPU timestamp before and after taking a GPU timestmp, then - // take the average of those two, hoping that it's roughly at the same time - // as the GPU timestamp. - static auto _cpu_now = []() { - cpu::fence(); - return comp::wall_clock::record(); - }; - - static auto _gpu_now = []() { - cpu::fence(); + auto _gpu_now = []() { uint64_t _ts = 0; - ROCPROFSYS_ROCTRACER_CALL(roctracer_get_timestamp(&_ts)); + roctracer_get_timestamp(&_ts); return _ts; }; - do - { - // warm up cache and allow for any static initialization - (void) _cpu_now(); - (void) _gpu_now(); - } while(false); - - auto _compute = [](volatile uint64_t& _cpu_ts, volatile uint64_t& _gpu_ts) { - _cpu_ts = 0; - _gpu_ts = 0; - _cpu_ts += _cpu_now() / 2; - _gpu_ts += _gpu_now() / 1; - _cpu_ts += _cpu_now() / 2; - return static_cast(_cpu_ts) - static_cast(_gpu_ts); - }; - constexpr int64_t _n = 10; - int64_t _cpu_ave = 0; - int64_t _gpu_ave = 0; - int64_t _diff = 0; - for(int64_t i = 0; i < _n; ++i) - { - volatile uint64_t _cpu_ts = 0; - volatile uint64_t _gpu_ts = 0; - _diff += _compute(_cpu_ts, _gpu_ts); - _cpu_ave += _cpu_ts / _n; - _gpu_ave += _gpu_ts / _n; - } - ROCPROFSYS_BASIC_VERBOSE(2, "CPU timestamp: %li\n", _cpu_ave); - ROCPROFSYS_BASIC_VERBOSE(2, "HIP timestamp: %li\n", _gpu_ave); + // discard (warm-up) + (void) tracing::get_clock_skew(_gpu_now, 1); + + auto _diff = tracing::get_clock_skew(_gpu_now, 10); ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _diff, _use ? "yes" : "no"); - _diff /= _n; return _diff; }(); return _v; diff --git a/source/lib/rocprof-sys/library/tracing.hpp b/source/lib/rocprof-sys/library/tracing.hpp index 3db7fe5a..624d30f8 100644 --- a/source/lib/rocprof-sys/library/tracing.hpp +++ b/source/lib/rocprof-sys/library/tracing.hpp @@ -651,5 +651,41 @@ mark_perfetto_track(CategoryT, const char* name, ::perfetto::Track _track, uint6 TRACE_EVENT_INSTANT(trait::name::value, ::perfetto::DynamicString{ name }, _track, _ts, std::forward(args)...); } + +template +int64_t +get_clock_skew(FuncT&& _timestamp_func, int64_t _n = 1) +{ + namespace cpu = tim::cpu; + // synchronize timestamps + // We'll take a CPU timestamp before and after taking a GPU timestmp, then + // take the average of those two, hoping that it's roughly at the same time + // as the GPU timestamp. + auto _cpu_now = []() { + cpu::fence(); + return now(); + }; + + auto _gpu_now = [&_timestamp_func]() { + cpu::fence(); + return std::forward(_timestamp_func)(); + }; + + auto _compute = [&_cpu_now, &_gpu_now]() { + volatile uint64_t _cpu_ts = 0; + volatile uint64_t _gpu_ts = 0; + _cpu_ts += _cpu_now(); + _gpu_ts += _gpu_now(); + _cpu_ts += _cpu_now(); + return static_cast(_cpu_ts / 2) - static_cast(_gpu_ts); + }; + + int64_t _diff = 0; + for(int64_t i = 0; i < _n; ++i) + { + _diff += _compute(); + } + return (_diff / _n); +} } // namespace tracing } // namespace rocprofsys