diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 9bd0c8ed..537e77d2 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -23,10 +23,10 @@ env: ROCM_PATH: "/opt/rocm" GPU_TARGETS: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" PATH: "/usr/bin:$PATH" - navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" - vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" - mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" - mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$" + navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$" + vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$" + mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$" + mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$" navi3_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$" vega20_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$" mi200_EXCLUDE_LABEL_REGEX: "^(openmp-target)$" diff --git a/source/docs/data/memory_allocation_trace.csv b/source/docs/data/memory_allocation_trace.csv index d9733b27..34a8848f 100644 --- a/source/docs/data/memory_allocation_trace.csv +++ b/source/docs/data/memory_allocation_trace.csv @@ -1,4 +1,7 @@ -"Kind","Operation","Agent_Id","Allocation_Size","Starting_Address","Correlation_Id","Start_Timestamp","End_Timestamp" -"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497356288,1,65788054621500,65788055678893 -"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497348096,1,65788055691832,65788056666844 
-"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497339904,1,65788056672061,65788057643457 +"Kind","Operation","Agent_Id","Allocation_Size","Address","Correlation_Id","Start_Timestamp","End_Timestamp" +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,11,3721742710532634,3721742710584854 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,12,3721742710596404,3721742710933366 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,13,3721742710941416,3721742710960916 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,14,3721742710967236,3721742711197647 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,15,3721742711204077,3721742711219717 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,16,3721742711225857,3721742711466018 diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index 5e0067f7..42363376 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -1318,5 +1318,5 @@ Properties - **`thread_id`** *(integer, required)*: Thread ID. - **`agent_id`** *(object, required)*: Agent ID. - **`handle`** *(integer, required)*: Handle of the agent. - - **`starting_address`** *(string, required)*: Starting address of allocation. + - **`address`** *(string, required)*: Starting address of the allocated or freed memory. - **`allocation_size`** *(integer, required)*: Size of allocation. diff --git a/source/docs/rocprofv3-schema.json b/source/docs/rocprofv3-schema.json index 4cbd97aa..ad68c19b 100644 --- a/source/docs/rocprofv3-schema.json +++ b/source/docs/rocprofv3-schema.json @@ -1566,13 +1566,13 @@ "handle" ] }, - "starting_address": { - "type": "integer", - "description": "Starting address of allocation" + "address": { + "type": "string", + "description": "Starting address of the memory being allocated or freed." 
}, "allocation_size": { "type": "integer", - "description": "allocation_size" + "description": "Size of the memory allocation in bytes. The size is not currently tracked for free operations, which report 0." } }, "required": [ @@ -1584,7 +1584,7 @@ "end_timestamp", "thread_id", "agent_id", - "starting_address", + "address", "allocation_size" ] } diff --git a/source/include/rocprofiler-sdk/buffer_tracing.h b/source/include/rocprofiler-sdk/buffer_tracing.h index 85557bfc..b0e22146 100644 --- a/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/source/include/rocprofiler-sdk/buffer_tracing.h @@ -217,9 +217,9 @@ typedef struct rocprofiler_thread_id_t thread_id; ///< id for thread that triggered copy rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds - rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation - uint64_t starting_address; ///< starting address for memory allocation - uint64_t allocation_size; ///< size for memory allocation + rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation + rocprofiler_address_t address; ///< starting address for memory allocation + uint64_t allocation_size; ///< size for memory allocation /// @var kind /// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION /// @var operation diff --git a/source/include/rocprofiler-sdk/callback_tracing.h b/source/include/rocprofiler-sdk/callback_tracing.h index ecd74a94..eddff403 100644 --- a/source/include/rocprofiler-sdk/callback_tracing.h +++ b/source/include/rocprofiler-sdk/callback_tracing.h @@ -211,16 +211,16 @@ typedef struct } rocprofiler_callback_tracing_memory_copy_data_t; /** - * @brief ROCProfiler Memory Copy Allocation Tracer Record. + * @brief ROCProfiler Memory Allocation Tracer Record. 
*/ typedef struct { - uint64_t size; ///< size of this struct - rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds - rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds - rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation - uint64_t starting_address; ///< starting address for memory allocation - uint64_t allocation_size; ///< size of memory allocation + uint64_t size; ///< size of this struct + rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds + rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds + rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation + rocprofiler_address_t address; ///< starting address for memory allocation + uint64_t allocation_size; ///< size of memory allocation } rocprofiler_callback_tracing_memory_allocation_data_t; /** diff --git a/source/include/rocprofiler-sdk/cxx/CMakeLists.txt b/source/include/rocprofiler-sdk/cxx/CMakeLists.txt index 0360b725..bacc4678 100644 --- a/source/include/rocprofiler-sdk/cxx/CMakeLists.txt +++ b/source/include/rocprofiler-sdk/cxx/CMakeLists.txt @@ -4,7 +4,7 @@ # # set(ROCPROFILER_CXX_HEADER_FILES hash.hpp name_info.hpp operators.hpp perfetto.hpp - serialization.hpp) + utility.hpp serialization.hpp) install( FILES ${ROCPROFILER_CXX_HEADER_FILES} diff --git a/source/include/rocprofiler-sdk/cxx/serialization.hpp b/source/include/rocprofiler-sdk/cxx/serialization.hpp index 216e4a3e..e92dadc9 100644 --- a/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -64,6 +66,7 @@ #define ROCP_SDK_SAVE_DATA_FIELD(FIELD) ar(make_nvp(#FIELD, data.FIELD)) #define ROCP_SDK_SAVE_DATA_VALUE(NAME, VALUE) ar(make_nvp(NAME, data.VALUE)) +#define ROCP_SDK_SAVE_VALUE(NAME, VALUE) ar(make_nvp(NAME, VALUE)) #define ROCP_SDK_SAVE_DATA_CSTR(FIELD) \ ar(make_nvp(#FIELD, std::string{data.FIELD 
? data.FIELD : ""})) #define ROCP_SDK_SAVE_DATA_BITFIELD(NAME, VALUE) \ @@ -338,7 +341,7 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_allocation_data_t data) ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(starting_address); + ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16)); ROCP_SDK_SAVE_DATA_FIELD(allocation_size); } @@ -543,7 +546,7 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data) ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(starting_address); + ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16)); ROCP_SDK_SAVE_DATA_FIELD(allocation_size); } diff --git a/source/include/rocprofiler-sdk/cxx/utility.hpp b/source/include/rocprofiler-sdk/cxx/utility.hpp new file mode 100644 index 00000000..c81fc0bd --- /dev/null +++ b/source/include/rocprofiler-sdk/cxx/utility.hpp @@ -0,0 +1,54 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// + +#pragma once + +#include +#include + +namespace rocprofiler +{ +namespace sdk +{ +namespace utility +{ +template +auto +_as_hex(Tp val, size_t width = 0) +{ + auto ss = std::stringstream{}; + ss << "0x" << std::hex << std::setw(width) << std::setfill('0') << val; + return ss.str(); +} + +#define ROCPROFILER_CXX_DEFINE_AS_HEX(TYPE) \ + inline auto as_hex(TYPE val, size_t width = 0) \ + { \ + return ::rocprofiler::sdk::utility::_as_hex(val, width); \ + } + +ROCPROFILER_CXX_DEFINE_AS_HEX(uint64_t) +#undef ROCPROFILER_CXX_DEFINE_AS_HEX +} // namespace utility +} // namespace sdk +} // namespace rocprofiler diff --git a/source/include/rocprofiler-sdk/fwd.h b/source/include/rocprofiler-sdk/fwd.h index 09fb8237..52fe1c0c 100644 --- a/source/include/rocprofiler-sdk/fwd.h +++ b/source/include/rocprofiler-sdk/fwd.h @@ -238,10 +238,11 @@ typedef enum // NOLINT(performance-enum-size) */ typedef enum // NOLINT(performance-enum-size) { - ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function - ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function - ROCPROFILER_MEMORY_ALLOCATION_MEMORY_POOL_ALLOCATE, ///< Allocate memory pool - ROCPROFILER_MEMORY_ALLOCATION_VMEM_HANDLE_CREATE, ///< Allocate vmem memory handle + ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function + ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function + ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE, ///< Allocate vmem memory handle + 
ROCPROFILER_MEMORY_ALLOCATION_FREE, ///< Free memory function + ROCPROFILER_MEMORY_ALLOCATION_VMEM_FREE, ///< Release vmem memory handle ROCPROFILER_MEMORY_ALLOCATION_LAST, } rocprofiler_memory_allocation_operation_t; @@ -458,11 +459,6 @@ typedef enum */ typedef uint64_t rocprofiler_timestamp_t; -/** - * @brief ROCProfiler Address. - */ -typedef uint64_t rocprofiler_address_t; - /** * @brief Thread ID. Value will be equivalent to `syscall(__NR_gettid)` */ @@ -519,6 +515,16 @@ typedef union rocprofiler_user_data_t void* ptr; ///< usage example: set to address of data allocation } rocprofiler_user_data_t; +/** + * @brief Stores memory address for profiling + * + */ +typedef union rocprofiler_address_t +{ + uint64_t value; ///< usage example: store address in uint64_t format + void* ptr; ///< usage example: generic form of address +} rocprofiler_address_t; + //--------------------------------------------------------------------------------------// // // STRUCTS diff --git a/source/lib/output/generateCSV.cpp b/source/lib/output/generateCSV.cpp index 9161c57a..12a308a2 100644 --- a/source/lib/output/generateCSV.cpp +++ b/source/lib/output/generateCSV.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -456,7 +457,7 @@ generate_csv(const output_config& "Operation", "Agent_Id", "Allocation_Size", - "Starting_Address", + "Address", "Correlation_Id", "Start_Timestamp", "End_Timestamp"}}; @@ -464,15 +465,24 @@ generate_csv(const output_config& { for(auto record : data.get(ditr)) { + uint64_t agent_info{0}; + // Free functions currently do not track agent information. 
Only set it on allocation + // operations, otherwise set it to 0 currently + if(record.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE || + record.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE) + { + agent_info = tool_metadata.get_node_id(record.agent_id); + } auto api_name = tool_metadata.get_operation_name(record.kind, record.operation); auto row_ss = std::stringstream{}; + rocprofiler::tool::csv::memory_allocation_csv_encoder::write_row( row_ss, tool_metadata.get_kind_name(record.kind), api_name, - tool_metadata.get_node_id(record.agent_id), + agent_info, record.allocation_size, - record.starting_address, + rocprofiler::sdk::utility::as_hex(record.address.value, 16), record.correlation_id.internal, record.start_timestamp, record.end_timestamp); diff --git a/source/lib/output/generateOTF2.cpp b/source/lib/output/generateOTF2.cpp index 7f9839de..555bf150 100644 --- a/source/lib/output/generateOTF2.cpp +++ b/source/lib/output/generateOTF2.cpp @@ -486,17 +486,25 @@ write_otf2( { for(auto& [agent, evt] : itr) { - const auto* _agent = _get_agent(agent); - auto _type_name = std::string_view{"UNK"}; - if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + // Free functions do not track agent information. 
Below handles case where + // null rocprof agent id is passed to generate OTF2 + constexpr auto null_rocp_agent_id = + rocprofiler_agent_id_t{.handle = std::numeric_limits::max()}; + const rocprofiler_agent_t* _agent = nullptr; + if(agent != null_rocp_agent_id) + { + _agent = _get_agent(agent); + } + auto _type_name = std::string_view{"UNK"}; + if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU) _type_name = "CPU"; - else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) _type_name = "GPU"; - evt.name = fmt::format("Thread {}, Memory Allocation at {} {}", + evt.name = fmt::format("Thread {}, Memory Operation at {} {}", tid, _type_name, - _agent->logical_node_type_id); + _agent == nullptr ? 0 : _agent->logical_node_type_id); } } @@ -860,6 +868,12 @@ write_otf2( for(auto& [agent, evt] : itr) { auto _hash = get_hash_id(evt.name); + // Using max numeric limits results in an out-of-bound runtime error for OTF2 + // and perfetto for agent ids. Setting handle to 0 for free functions. 
+ constexpr auto null_rocp_agent_id = + rocprofiler_agent_id_t{.handle = std::numeric_limits::max()}; + auto handle = agent.handle; + if(agent == null_rocp_agent_id) handle = 0; add_write_string(_hash, evt.name); OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer, @@ -867,7 +881,7 @@ write_otf2( _hash, OTF2_LOCATION_TYPE_ACCELERATOR_STREAM, 2 * evt.event_count, // # events - agent.handle // location group + handle // location group )); } } diff --git a/source/lib/output/generatePerfetto.cpp b/source/lib/output/generatePerfetto.cpp index fa16fe65..0cbfb010 100644 --- a/source/lib/output/generatePerfetto.cpp +++ b/source/lib/output/generatePerfetto.cpp @@ -241,33 +241,6 @@ write_perfetto( } } - for(const auto& itr : agent_thread_ids_alloc) - { - const auto* _agent = _get_agent(itr.first); - - for(auto titr : itr.second) - { - auto _namess = std::stringstream{}; - _namess << "MEMORY ALLOCATION on AGENT [" << _agent->logical_node_id << "] THREAD [" - << thread_indexes.at(titr) << "] "; - - if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _namess << "(CPU)"; - else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _namess << "(GPU)"; - else - _namess << "(UNK)"; - - auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; - auto _desc = _track.Serialize(); - _desc.set_name(_namess.str()); - - perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); - - agent_thread_tracks_alloc[itr.first].emplace(titr, _track); - } - } - for(const auto& aitr : agent_queue_ids) { uint32_t nqueue = 0; @@ -463,47 +436,6 @@ write_perfetto( tracing_session->FlushBlocking(); } - for(auto ditr : memory_allocation_gen) - for(auto itr : memory_allocation_gen.get(ditr)) - { - auto name = buffer_names.at(itr.kind, itr.operation); - auto& track = agent_thread_tracks_alloc.at(itr.agent_id).at(itr.thread_id); - std::stringstream hex_stream; - hex_stream << "0x" << std::hex << std::setw(16) << std::setfill('0') - << itr.starting_address; - std::string 
hex_starting_address(hex_stream.str()); - - TRACE_EVENT_BEGIN(sdk::perfetto_category::name, - ::perfetto::StaticString(name.data()), - track, - itr.start_timestamp, - ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), - "begin_ns", - itr.start_timestamp, - "end_ns", - itr.end_timestamp, - "delta_ns", - (itr.end_timestamp - itr.start_timestamp), - "kind", - itr.kind, - "operation", - itr.operation, - "agent", - agents_map.at(itr.agent_id).logical_node_id, - "allocation_size", - itr.allocation_size, - "starting_address", - hex_starting_address, - "corr_id", - itr.correlation_id.internal, - "tid", - itr.thread_id); - TRACE_EVENT_END(sdk::perfetto_category::name, - track, - itr.end_timestamp); - tracing_session->FlushBlocking(); - } - for(auto ditr : kernel_dispatch_gen) for(auto itr : kernel_dispatch_gen.get(ditr)) { @@ -634,6 +566,99 @@ write_perfetto( tracing_session->FlushBlocking(); } } + + // memory allocation counter track + auto mem_alloc_endpoints = std::map>{}; + auto mem_alloc_extremes = std::pair{}; + auto address_to_size = std::unordered_map{}; + for(auto ditr : memory_allocation_gen) + for(auto itr : memory_allocation_gen.get(ditr)) + { + uint64_t _mean_timestamp = + itr.start_timestamp + (0.5 * (itr.end_timestamp - itr.start_timestamp)); + + mem_alloc_endpoints[itr.agent_id].emplace(itr.start_timestamp - 1000, 0); + mem_alloc_endpoints[itr.agent_id].emplace(itr.start_timestamp, 0); + mem_alloc_endpoints[itr.agent_id].emplace(_mean_timestamp, 0); + mem_alloc_endpoints[itr.agent_id].emplace(itr.end_timestamp, 0); + mem_alloc_endpoints[itr.agent_id].emplace(itr.end_timestamp + 1000, 0); + + mem_alloc_extremes = + std::make_pair(std::min(mem_alloc_extremes.first, itr.start_timestamp), + std::max(mem_alloc_extremes.second, itr.end_timestamp)); + if(itr.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE || + itr.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE) + { + address_to_size.emplace(itr.address.value, itr.allocation_size); + } + } + 
+ for(auto ditr : memory_allocation_gen) + for(auto itr : memory_allocation_gen.get(ditr)) + { + auto alloc_beg = + mem_alloc_endpoints.at(itr.agent_id).lower_bound(itr.start_timestamp); + auto alloc_end = + mem_alloc_endpoints.at(itr.agent_id).upper_bound(itr.end_timestamp); + + LOG_IF(FATAL, alloc_beg == alloc_end) + << "Missing range for timestamp [" << itr.start_timestamp << ", " + << itr.end_timestamp << "]"; + + for(auto alloc_itr = alloc_beg; alloc_itr != alloc_end; ++alloc_itr) + { + if(address_to_size.count(itr.address.value) > 0) + { + alloc_itr->second += address_to_size.at(itr.address.value); + } + } + } + + auto mem_alloc_tracks = + std::unordered_map{}; + auto mem_alloc_cnt_names = std::vector{}; + constexpr auto null_rocp_agent_id = + rocprofiler_agent_id_t{.handle = std::numeric_limits::max()}; + mem_alloc_cnt_names.reserve(mem_alloc_endpoints.size()); + for(auto& alloc_itr : mem_alloc_endpoints) + { + mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.first - 5000, 0); + mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.second + 5000, 0); + + auto _track_name = std::stringstream{}; + const rocprofiler_agent_t* _agent = nullptr; + if(alloc_itr.first != null_rocp_agent_id) + { + _agent = _get_agent(alloc_itr.first); + } + + if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (CPU)"; + else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (GPU)"; + else + _track_name << "FREE BYTES"; + + constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES; + auto& _name = mem_alloc_cnt_names.emplace_back(_track_name.str()); + mem_alloc_tracks.emplace(alloc_itr.first, + ::perfetto::CounterTrack{_name.c_str()} + .set_unit(_unit) + .set_unit_multiplier(bytes_multiplier) + .set_is_incremental(false)); + } + + for(auto& alloc_itr : 
mem_alloc_endpoints) + { + for(auto itr : alloc_itr.second) + { + TRACE_COUNTER(sdk::perfetto_category::name, + mem_alloc_tracks.at(alloc_itr.first), + itr.first, + itr.second / bytes_multiplier); + tracing_session->FlushBlocking(); + } + } } ::perfetto::TrackEvent::Flush(); diff --git a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp index 4d0c1dfd..29c1ddae 100644 --- a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp +++ b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp @@ -481,6 +481,9 @@ HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, hsa_memory_allocate, hsa_memory_allocate_fn) HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate_fn) HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create_fn) +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free, hsa_memory_free, hsa_memory_free_fn) +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free, hsa_amd_memory_pool_free, hsa_amd_memory_pool_free_fn) +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release_fn) // clang-format on #else diff --git a/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp b/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp index 38648d64..c249bcf3 100644 --- a/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp +++ b/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp @@ -68,49 +68,125 @@ using memory_pool_to_agent_map = std::unordered_map; using map_pool_to_agent_pair 
= std::pair; +template +hsa_status_t +memory_allocation_impl(Args... args); + +template +hsa_status_t +memory_free_impl(Args... args); + +// Local enum to specify implementation of memory function wrappers +typedef enum +{ + HSA_NONE = 0, ///< Unknown memory allocation function + HSA_MEMORY_ALLOCATE, ///< Allocate memory function + HSA_AMD_MEMORY_POOL_ALLOCATE, ///< Allocate memory pool + HSA_AMD_VMEM_ALLOCATE, ///< Allocate vmem memory handle + HSA_MEMORY_FREE, ///< Free memory function + HSA_AMD_MEMORY_POOL_FREE, ///< Free memory pool + HSA_AMD_VMEM_FREE, ///< Release vmem memory handle + HSA_LAST, +} hsa_memory_operation_functions_t; + // Set up information to identify agent from regions/pool template struct memory_allocation_info; -#define SPECIALIZE_MEMORY_ALLOCATION_INFO(FUNCTION, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC) \ +#define SPECIALIZE_MEMORY_ALLOCATION_INFO( \ + FUNCTION, ENUM, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC, IMPLEMENTATION) \ template <> \ - struct memory_allocation_info \ + struct memory_allocation_info \ { \ using maptype = MAPTYPE; \ using pairtype = PAIRTYPE; \ using searchtype = SEARCHTYPE; \ auto& operator()() const { return ITERATEFUNC; } \ - static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \ - static constexpr auto name = "MEMORY_ALLOCATION_" #FUNCTION; \ + static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##ENUM; \ + static constexpr auto name = "MEMORY_ALLOCATION_" #ENUM; \ + \ + template \ + static auto get_memory_allocation_impl(RetT (*)(Args...)) \ + { \ + return &IMPLEMENTATION; \ + } \ }; -SPECIALIZE_MEMORY_ALLOCATION_INFO(NONE, +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_NONE, + NONE, region_to_agent_map, region_to_agent_pair, hsa_region_t, - get_core_table()->hsa_agent_iterate_regions_fn) -SPECIALIZE_MEMORY_ALLOCATION_INFO(ALLOCATE, + get_core_table()->hsa_agent_iterate_regions_fn, + memory_allocation_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_ALLOCATE, + ALLOCATE, 
region_to_agent_map, region_to_agent_pair, hsa_region_t, - get_core_table()->hsa_agent_iterate_regions_fn) -SPECIALIZE_MEMORY_ALLOCATION_INFO(MEMORY_POOL_ALLOCATE, + get_core_table()->hsa_agent_iterate_regions_fn, + memory_allocation_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_ALLOCATE, + ALLOCATE, memory_pool_to_agent_map, map_pool_to_agent_pair, hsa_amd_memory_pool_t, - get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn) -SPECIALIZE_MEMORY_ALLOCATION_INFO(VMEM_HANDLE_CREATE, + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn, + memory_allocation_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_ALLOCATE, + VMEM_ALLOCATE, memory_pool_to_agent_map, map_pool_to_agent_pair, hsa_amd_memory_pool_t, - get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn) + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn, + memory_allocation_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_FREE, + FREE, + region_to_agent_map, + region_to_agent_pair, + hsa_region_t, + get_core_table()->hsa_agent_iterate_regions_fn, + memory_free_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_FREE, + FREE, + memory_pool_to_agent_map, + map_pool_to_agent_pair, + hsa_amd_memory_pool_t, + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn, + memory_free_impl) +SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_FREE, + VMEM_FREE, + memory_pool_to_agent_map, + map_pool_to_agent_pair, + hsa_amd_memory_pool_t, + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn, + memory_free_impl) #undef SPECIALIZE_MEMORY_ALLOCATION_INFO +// Map rocprofiler_memory_allocation_operation_t to respective name +template +struct memory_allocation_name; + +#define MEMORY_ALLOCATION_NAME(ENUM) \ + template <> \ + struct memory_allocation_name \ + { \ + static constexpr auto name = "MEMORY_ALLOCATION_" #ENUM; \ + static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##ENUM; \ + }; + +MEMORY_ALLOCATION_NAME(NONE) 
+MEMORY_ALLOCATION_NAME(ALLOCATE) +MEMORY_ALLOCATION_NAME(VMEM_ALLOCATE) +MEMORY_ALLOCATION_NAME(FREE) +MEMORY_ALLOCATION_NAME(VMEM_FREE) +#undef MEMORY_ALLOCATION_NAME + template const char* name_by_id(const uint32_t id, std::index_sequence) { - if(Idx == id) return memory_allocation_info::name; + if(Idx == id) return memory_allocation_name::name; if constexpr(sizeof...(IdxTail) > 0) return name_by_id(id, std::index_sequence{}); else @@ -121,8 +197,8 @@ template uint32_t id_by_name(const char* name, std::index_sequence) { - if(std::string_view{memory_allocation_info::name} == std::string_view{name}) - return memory_allocation_info::operation_idx; + if(std::string_view{memory_allocation_name::name} == std::string_view{name}) + return memory_allocation_name::operation_idx; if constexpr(sizeof...(IdxTail) > 0) return id_by_name(name, std::index_sequence{}); else @@ -137,7 +213,7 @@ get_ids(std::vector& _id_list, std::index_sequence) if(_v < static_cast(ROCPROFILER_MEMORY_ALLOCATION_LAST)) _vec.emplace_back(_v); }; - (_emplace(_id_list, memory_allocation_info::operation_idx), ...); + (_emplace(_id_list, memory_allocation_name::operation_idx), ...); } template @@ -148,7 +224,7 @@ get_names(std::vector& _name_list, std::index_sequence) if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v); }; - (_emplace(_name_list, memory_allocation_info::name), ...); + (_emplace(_name_list, memory_allocation_name::name), ...); } bool @@ -169,17 +245,24 @@ context_filter(const context::context* ctx) enum memory_allocation_core_id { memory_allocation_core_allocate_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, + memory_allocation_core_free_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free, }; -using memory_allocation_core_index_seq_t = std::index_sequence; +using memory_allocation_core_index_seq_t = + std::index_sequence; enum memory_allocation_amd_ext_id { memory_allocation_amd_ext_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, - 
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create + memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, + memory_allocation_amd_ext_free_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free, + memory_allocation_vmem_release_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release, }; using memory_allocation_amd_ext_index_seq_t = - std::index_sequence; + std::index_sequence; template struct memory_allocation_seq; @@ -203,14 +286,17 @@ struct arg_indices; template <> \ struct arg_indices \ { \ - static constexpr auto starting_address_idx = STARTING_ADDRESS_IDX; \ - static constexpr auto size_idx = SIZE_IDX; \ - static constexpr auto region_idx = REGION_IDX; \ + static constexpr auto address_idx = STARTING_ADDRESS_IDX; \ + static constexpr auto size_idx = SIZE_IDX; \ + static constexpr auto region_idx = REGION_IDX; \ }; HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_allocate_id, 2, 1, 0) HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_allocate_id, 3, 1, 0) HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_allocate_id, 4, 1, 0) +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_free_id, 0, 0, 0) +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_free_id, 0, 0, 0) +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_release_id, 0, 0, 0) // Define operation indices for each tracked functions template @@ -220,12 +306,15 @@ struct memory_allocation_op; template <> \ struct memory_allocation_op \ { \ - static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \ + static constexpr auto operation_idx = FUNCTION; \ }; -MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, ALLOCATE); -MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, MEMORY_POOL_ALLOCATE); -MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, VMEM_HANDLE_CREATE); 
+MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, HSA_MEMORY_ALLOCATE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, HSA_AMD_MEMORY_POOL_ALLOCATE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, HSA_AMD_VMEM_ALLOCATE) +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_free_id, HSA_MEMORY_FREE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_free_id, HSA_AMD_MEMORY_POOL_FREE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_release_id, HSA_AMD_VMEM_FREE); template decltype(auto) @@ -255,7 +344,7 @@ struct memory_allocation_data rocprofiler_thread_id_t tid = common::get_tid(); rocprofiler_agent_id_t agent = null_rocp_agent_id; uint64_t size_allocated = 0; - uint64_t starting_addr = 0; + rocprofiler_address_t address = {.value = 0}; uint64_t start_ts = 0; context::correlation_id* correlation_id = nullptr; tracing::tracing_data tracing_data = {}; @@ -271,7 +360,7 @@ memory_allocation_data::callback_data_t memory_allocation_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const { return common::init_public_api_struct( - callback_data_t{}, _beg, _end, agent, starting_addr, size_allocated); + callback_data_t{}, _beg, _end, agent, address, size_allocated); } memory_allocation_data::buffered_data_t @@ -291,7 +380,7 @@ memory_allocation_data::get_buffered_record(const context_t* _ctx, _beg, _end, agent, - starting_addr, + address, size_allocated); } @@ -336,16 +425,32 @@ get_agent(T val, IterateFunc iterate_func, CallbackFunc callback) return existing.count(val) == 0 ? 
null_rocp_agent_id : existing.at(val); } -uint64_t +void* handle_starting_addr(void** starting_addr_pointer) { - return reinterpret_cast(*starting_addr_pointer); + return *starting_addr_pointer; } -uint64_t +// The handle field of hsa_amd_vmem_alloc_handle_t is the starting address +// cast as uint64_t, so returning the handle field after casting to void* suffices +void* handle_starting_addr(hsa_amd_vmem_alloc_handle_t* vmem_alloc_handle) { - return vmem_alloc_handle->handle; + return reinterpret_cast(vmem_alloc_handle->handle); +} + +// Handling starting address for free memory operations +void* +handle_starting_addr(void* starting_addr_pointer) +{ + return starting_addr_pointer; +} + +// Handles starting address for releasing handle +void* +handle_starting_addr(hsa_amd_vmem_alloc_handle_t vmem_alloc_handle) +{ + return reinterpret_cast(vmem_alloc_handle.handle); } // Wrapper implementation that stores memory allocation information @@ -353,11 +458,12 @@ template hsa_status_t memory_allocation_impl(Args... args) { - constexpr auto N = sizeof...(Args); - constexpr auto starting_address_idx = arg_indices::starting_address_idx; - constexpr auto size_idx = arg_indices::size_idx; - constexpr auto region_idx = arg_indices::region_idx; - constexpr auto operation = memory_allocation_op::operation_idx; + constexpr auto N = sizeof...(Args); + constexpr auto address_idx = arg_indices::address_idx; + constexpr auto size_idx = arg_indices::size_idx; + constexpr auto region_idx = arg_indices::region_idx; + constexpr auto operation = memory_allocation_op::operation_idx; + constexpr auto rocprofiler_enum = memory_allocation_info::operation_idx; auto&& _tied_args = std::tie(args...); memory_allocation_data _data{}; @@ -380,7 +486,7 @@ memory_allocation_impl(Args... 
args) } auto& tracing_data = _data.tracing_data; - auto starting_addr_pointer = std::get(_tied_args); + auto starting_addr_pointer = std::get(_tied_args); auto region_or_pool = std::get(_tied_args); _data.tid = common::get_tid(); @@ -389,7 +495,7 @@ memory_allocation_impl(Args... args) memory_allocation_info{}(), callback_populate_map::searchtype>); _data.size_allocated = std::get(_tied_args); - _data.func = operation; + _data.func = rocprofiler_enum; _data.correlation_id = context::get_latest_correlation_id(); if(!_data.correlation_id) @@ -405,7 +511,7 @@ memory_allocation_impl(Args... args) tracing_data.external_correlation_ids, thr_id, ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, - operation, + rocprofiler_enum, _data.correlation_id->internal); if(!tracing_data.callback_contexts.empty()) @@ -417,7 +523,7 @@ memory_allocation_impl(Args... args) _data.correlation_id->internal, tracing_data.external_correlation_ids, ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, - operation, + rocprofiler_enum, _tracer_data); // enter callback may update the external correlation id field tracing::update_external_correlation_ids( @@ -433,7 +539,7 @@ memory_allocation_impl(Args... args) // checks before retrieving starting address? if(starting_addr_pointer != nullptr) { - _data.starting_addr = handle_starting_addr(starting_addr_pointer); + _data.address.ptr = handle_starting_addr(starting_addr_pointer); } if(!tracing_data.empty()) @@ -445,7 +551,7 @@ memory_allocation_impl(Args... args) tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts, _data.tracing_data.external_correlation_ids, ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, - operation, + rocprofiler_enum, _tracer_data); } @@ -458,7 +564,7 @@ memory_allocation_impl(Args... 
args) _data.correlation_id->internal, _data.tracing_data.external_correlation_ids, ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, - operation, + rocprofiler_enum, record); } } @@ -468,11 +574,113 @@ memory_allocation_impl(Args... args) return _ret; } -template -auto get_memory_allocation_impl(RetT (*)(Args...)) +// Wrapper implementation that stores memory free operation information +template +hsa_status_t +memory_free_impl(Args... args) { - return &memory_allocation_impl; + constexpr auto N = sizeof...(Args); + constexpr auto address_idx = arg_indices::address_idx; + constexpr auto operation = memory_allocation_op::operation_idx; + constexpr auto rocprofiler_enum = memory_allocation_info::operation_idx; + + auto&& _tied_args = std::tie(args...); + memory_allocation_data _data{}; + + { + auto tracing_data = tracing::tracing_data{}; + + tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + OpIdx, + tracing_data); + // if no contexts are tracing memory copies for this direction, execute as usual + if(tracing_data.empty()) + { + return invoke(get_next_dispatch(), + std::move(_tied_args), + std::make_index_sequence{}); + } + _data.tracing_data = std::move(tracing_data); + } + + auto& tracing_data = _data.tracing_data; + + _data.tid = common::get_tid(); + _data.func = rocprofiler_enum; + _data.correlation_id = context::get_latest_correlation_id(); + _data.address.ptr = handle_starting_addr(std::get(_tied_args)); + + if(!_data.correlation_id) + { + constexpr auto ref_count = 1; + _data.correlation_id = context::correlation_tracing_service::construct(ref_count); + } + + // increase the reference count to denote that this correlation id is being used in a kernel + _data.correlation_id->add_ref_count(); + auto thr_id = _data.correlation_id->thread_idx; + tracing::populate_external_correlation_ids( + tracing_data.external_correlation_ids, + thr_id, + 
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, + rocprofiler_enum, + _data.correlation_id->internal); + + if(!tracing_data.callback_contexts.empty()) + { + auto _tracer_data = _data.get_callback_data(); + + tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts, + thr_id, + _data.correlation_id->internal, + tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + rocprofiler_enum, + _tracer_data); + // enter callback may update the external correlation id field + tracing::update_external_correlation_ids( + tracing_data.external_correlation_ids, + thr_id, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION); + } + auto start_ts = common::timestamp_ns(); + auto _ret = invoke( + get_next_dispatch(), std::move(_tied_args), std::make_index_sequence{}); + auto end_ts = common::timestamp_ns(); + + if(!tracing_data.empty()) + { + if(!_data.tracing_data.callback_contexts.empty()) + { + auto _tracer_data = _data.get_callback_data(start_ts, end_ts); + + tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts, + _data.tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + rocprofiler_enum, + _tracer_data); + } + + if(!_data.tracing_data.buffered_contexts.empty()) + { + auto record = _data.get_buffered_record(nullptr, start_ts, end_ts); + + tracing::execute_buffer_record_emplace(_data.tracing_data.buffered_contexts, + _data.tid, + _data.correlation_id->internal, + _data.tracing_data.external_correlation_ids, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + rocprofiler_enum, + record); + } + } + + // decrement the reference count after usage in the callback/buffers + _data.correlation_id->sub_ref_count(); + return _ret; } + } // namespace // check out the assembly here... 
this compiles to a switch statement const char* @@ -573,7 +781,9 @@ memory_allocation_wrap(Tp* _orig, std::integral_constant) auto& _dispatch = get_next_dispatch(); CHECK_NOTNULL(_dispatch); - _func = get_memory_allocation_impl(_func); + constexpr auto LocalIdx = memory_allocation_op::operation_idx; + _func = memory_allocation_info::template get_memory_allocation_impl( + _func); } template diff --git a/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp b/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp index 2b94eb32..d743c8ed 100644 --- a/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp +++ b/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp @@ -170,6 +170,8 @@ call_hsa_memory_allocate(const size_t i, const size_t base_size, hsa_agent_t age status = hsa_memory_allocate(region_list[0], base_size, &addr); RET_IF_HSA_ERR(status) + status = hsa_memory_free(addr); + RET_IF_HSA_ERR(status) } } @@ -199,6 +201,8 @@ call_hsa_memory_pool_allocate(const size_t i, const size_t base_size, hsa_agent_ status = hsa_amd_memory_pool_allocate(memory_pool_list[0], base_size, flags, &addr); RET_IF_HSA_ERR(status) + status = hsa_amd_memory_pool_free(addr); + RET_IF_HSA_ERR(status) } } @@ -243,6 +247,8 @@ call_hsa_vmem_allocate(const size_t i, hsa_agent_t agent) status = hsa_amd_vmem_handle_create( memory_pool_list[0], size, MEMORY_TYPE_NONE, 0, &memory_handle); RET_IF_HSA_ERR(status) + status = hsa_amd_vmem_handle_release(memory_handle); + RET_IF_HSA_ERR(status) } } @@ -257,7 +263,7 @@ main() hsa_agent_t cpu_agent = get_cpu_agent(agents); hsa_agent_t gpu_agent = get_gpu_agent(agents); call_hsa_memory_allocate(6, 1024, cpu_agent); - call_hsa_memory_pool_allocate(9, 512, gpu_agent); + call_hsa_memory_pool_allocate(9, 2048, gpu_agent); // Virtual memory API not supported in CI. 
Will add back if this changes // call_hsa_vmem_allocate(3, gpu_agent); diff --git a/tests/hsa-memory-allocation/validate.py b/tests/hsa-memory-allocation/validate.py index 56ec4bcd..ab10447d 100644 --- a/tests/hsa-memory-allocation/validate.py +++ b/tests/hsa-memory-allocation/validate.py @@ -175,25 +175,26 @@ def test_memory_alloc_sizes(input_data): # Op values: # 0 == ??? (unknown) # 1 == hsa_memory_allocate - # 2 == hsa_amd_ext_memory_allocate - # 3 == hsa_amd_vmem_handle_create + # 2 == hsa_amd_vmem_handle_create + # 3 == hsa_memory_free + # 4 == hsa_amd_vmem_handle_release memory_alloc_cnt = dict( [ (idx, {"agent": set(), "starting_addr": set(), "size": set(), "count": 0}) - for idx in range(1, 4) + for idx in range(1, 5) ] ) for itr in sdk_data["buffer_records"]["memory_allocations"]: op_id = itr["operation"] - assert op_id > 0 and op_id <= 3, f"{itr}" + assert op_id > 0 and op_id <= 5, f"{itr}" memory_alloc_cnt[op_id]["count"] += 1 - memory_alloc_cnt[op_id]["starting_addr"].add(itr.starting_address) + memory_alloc_cnt[op_id]["starting_addr"].add(itr.address) memory_alloc_cnt[op_id]["size"].add(itr.allocation_size) memory_alloc_cnt[op_id]["agent"].add(itr.agent_id.handle) for itr in sdk_data["callback_records"]["memory_copies"]: op_id = itr.operation - assert op_id > 0 and op_id <= 3, f"{itr}" + assert op_id > 0 and op_id <= 5, f"{itr}" memory_alloc_cnt[op_id]["count"] += 1 phase = itr.phase @@ -210,7 +211,7 @@ def test_memory_alloc_sizes(input_data): assert pitr.end_timestamp > 0, f"{itr}" assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}" - memory_alloc_cnt[op_id]["starting_addr"].add(pitr.starting_address) + memory_alloc_cnt[op_id]["starting_addr"].add(pitr.address) memory_alloc_cnt[op_id]["size"].add(pitr.allocation_size) memory_alloc_cnt[op_id]["agent"].add(pitr.agent_id.handle) else: @@ -218,24 +219,22 @@ def test_memory_alloc_sizes(input_data): # In the memory allocation test which generates this file # 6 hsa_memory_allocation calls with 1024 
bytes were called - # and 9 hsa_amd_memory_pool_allocations with 512 bytes + # and 9 hsa_amd_memory_pool_allocations with 2048 bytes # were called - assert memory_alloc_cnt[1]["count"] == 6 - assert memory_alloc_cnt[2]["count"] == 9 + assert memory_alloc_cnt[1]["count"] == 15 + assert memory_alloc_cnt[3]["count"] == 15 # assert memory_alloc_cnt[3]["count"] == 3 - assert len(memory_alloc_cnt[1]["starting_addr"]) == 6 - assert len(memory_alloc_cnt[2]["starting_addr"]) == 9 + assert len(memory_alloc_cnt[1]["starting_addr"]) == len( + memory_alloc_cnt[3]["starting_addr"] + ) + # assert len(memory_alloc_cnt[3]["starting_addr"]) == 3 - assert len(memory_alloc_cnt[1]["size"]) == 1 - assert len(memory_alloc_cnt[2]["size"]) == 1 + assert len(memory_alloc_cnt[1]["size"]) == 2 # assert len(memory_alloc_cnt[3]["size"]) == 1 assert 1024 in memory_alloc_cnt[1]["size"] - assert 512 in memory_alloc_cnt[2]["size"] - assert len(memory_alloc_cnt[1]["agent"]) == 1 - assert len(memory_alloc_cnt[2]["agent"]) == 1 + assert 2048 in memory_alloc_cnt[1]["size"] + assert len(memory_alloc_cnt[1]["agent"]) == 2 # assert len(memory_alloc_cnt[3]["agent"]) == 1 - assert memory_alloc_cnt[1]["agent"] != memory_alloc_cnt[2]["agent"] - # assert memory_alloc_cnt[2]["agent"] == memory_alloc_cnt[3]["agent"] def test_retired_correlation_ids(input_data): diff --git a/tests/rocprofv3/aborted-app/CMakeLists.txt b/tests/rocprofv3/aborted-app/CMakeLists.txt index de4d730a..4d4720d3 100644 --- a/tests/rocprofv3/aborted-app/CMakeLists.txt +++ b/tests/rocprofv3/aborted-app/CMakeLists.txt @@ -19,9 +19,11 @@ string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV # disable this test for thread sanitizers because of "signal-unsafe call inside signal" # issues on mi200 and mi300 (works fine on vega20 and navi32) if(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer") - set(IS_THREAD_SANITIZER ON) + set(DISABLE_THIS_TEST ON) else() - set(IS_THREAD_SANITIZER OFF) + # set(DISABLE_THIS_TEST OFF) + set(DISABLE_THIS_TEST 
ON) # this test is currently unstable so we are disabling it + # unconditionally for now endif() set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1 @@ -46,7 +48,7 @@ set_tests_properties( WILL_FAIL TRUE DISABLED - "${IS_THREAD_SANITIZER}") + "${DISABLE_THIS_TEST}") add_test( NAME rocprofv3-test-validate-app-abort @@ -64,4 +66,4 @@ set_tests_properties( FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" DISABLED - "${IS_THREAD_SANITIZER}") + "${DISABLE_THIS_TEST}") diff --git a/tests/rocprofv3/memory-allocation/CMakeLists.txt b/tests/rocprofv3/memory-allocation/CMakeLists.txt index b05546f4..9a42261c 100644 --- a/tests/rocprofv3/memory-allocation/CMakeLists.txt +++ b/tests/rocprofv3/memory-allocation/CMakeLists.txt @@ -21,7 +21,7 @@ add_test( NAME rocprofv3-test-memory-allocation-tracing-execute COMMAND $ --memory-allocation-trace -d - ${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace otf2 + ${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json otf2 --log-level env -- $) set_tests_properties( @@ -35,8 +35,6 @@ add_test( COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.json - --pftrace-input - ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.pftrace --otf2-input ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.otf2) diff --git a/tests/rocprofv3/memory-allocation/conftest.py b/tests/rocprofv3/memory-allocation/conftest.py index 8504f44c..00b58d05 100644 --- a/tests/rocprofv3/memory-allocation/conftest.py +++ b/tests/rocprofv3/memory-allocation/conftest.py @@ -17,12 +17,6 @@ def pytest_addoption(parser): default="memory-allocation-tracing/out_results.json", help="Input JSON", ) - parser.addoption( - "--pftrace-input", - action="store", - default="memory-allocation-tracing/out_results.pftrace", - help="Input JSON", - ) parser.addoption( "--otf2-input", action="store", 
@@ -38,12 +32,6 @@ def json_data(request): return dotdict(collapse_dict_list(json.load(inp))) -@pytest.fixture -def pftrace_data(request): - filename = request.config.getoption("--pftrace-input") - return PerfettoReader(filename).read()[0] - - @pytest.fixture def otf2_data(request): filename = request.config.getoption("--otf2-input") diff --git a/tests/rocprofv3/memory-allocation/validate.py b/tests/rocprofv3/memory-allocation/validate.py index b2448106..80b833ac 100755 --- a/tests/rocprofv3/memory-allocation/validate.py +++ b/tests/rocprofv3/memory-allocation/validate.py @@ -35,7 +35,7 @@ def test_memory_allocation(json_data): _, bf_op_names = get_operation(data, "MEMORY_ALLOCATION") - assert len(bf_op_names) == 4 + assert len(bf_op_names) == 5 allocation_reported_agent_ids = set() # check buffering data @@ -49,12 +49,12 @@ def test_memory_allocation(json_data): assert "thread_id" in node assert "agent_id" in node - assert "starting_address" in node + assert "address" in node assert "allocation_size" in node assert node.size > 0 - assert node.allocation_size > 0 - assert node.starting_address > 0 + assert node.allocation_size >= 0 + assert len(node.address) > 0 assert node.thread_id > 0 assert node.agent_id.handle > 0 assert node.start_timestamp > 0 @@ -69,14 +69,6 @@ def test_memory_allocation(json_data): allocation_reported_agent_ids.add(node["agent_id"]["handle"]) - assert 2**64 - 1 not in allocation_reported_agent_ids - - -def test_perfetto_data(pftrace_data, json_data): - import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 - - rocprofv3.test_perfetto_data(pftrace_data, json_data, ("memory_allocation",)) - def test_otf2_data(otf2_data, json_data): import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 diff --git a/tests/rocprofv3/summary/validate.py b/tests/rocprofv3/summary/validate.py index 2adc1666..57648455 100644 --- a/tests/rocprofv3/summary/validate.py +++ b/tests/rocprofv3/summary/validate.py @@ -242,12 +242,12 @@ def get_dims(df): assert get_dims(marker) 
== [7, 9], f"{marker}" assert get_dims(memcpy) == [2, 9], f"{memcpy}" - assert get_dims(memalloc) == [1, 9], f"{memalloc}" + assert get_dims(memalloc) == [2, 9], f"{memalloc}" assert get_dims(dispatch) == [3, 9], f"{dispatch}" assert get_dims(dispatch_and_copy) == [5, 9], f"{dispatch_and_copy}" assert get_dims(hip) == [14, 9], f"{hip}" assert get_dims(hip_and_marker) == expected_hip_and_marker_dims, f"{hip_and_marker}" - assert get_dims(total) == [24, 9], f"{total}" + assert get_dims(total) == [25, 9], f"{total}" def test_perfetto_data(pftrace_data, json_data): @@ -256,7 +256,7 @@ def test_perfetto_data(pftrace_data, json_data): rocprofv3.test_perfetto_data( pftrace_data, json_data, - ("hip", "marker", "kernel", "memory_copy", "memory_allocation"), + ("hip", "marker", "kernel", "memory_copy"), ) diff --git a/tests/tools/json-tool.cpp b/tests/tools/json-tool.cpp index fe3bc102..8b9f93c7 100644 --- a/tests/tools/json-tool.cpp +++ b/tests/tools/json-tool.cpp @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1843,6 +1844,7 @@ write_perfetto() auto tids = std::set{}; auto agent_ids = std::set{}; + auto agent_ids_alloc = std::set{}; auto agent_queue_ids = std::map>{}; auto _get_agent = [](uint64_t id_handle) -> const rocprofiler_agent_t* { @@ -1875,7 +1877,7 @@ write_perfetto() for(auto itr : memory_allocation_bf_records) { tids.emplace(itr.thread_id); - agent_ids.emplace(itr.agent_id.handle); + agent_ids_alloc.emplace(itr.agent_id.handle); } for(auto itr : kernel_dispatch_bf_records) @@ -1934,6 +1936,36 @@ write_perfetto() agent_tracks.emplace(itr, _track); } + for(auto itr : agent_ids_alloc) + { + const auto* _agent = _get_agent(itr); + auto _namess = std::stringstream{}; + + if(_agent != nullptr) + { + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _namess << "CPU MEMORY OPERATION [" << itr << "] "; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _namess << "GPU MEMORY OPERATION [" << itr << "] "; + + 
if(!std::string_view{_agent->model_name}.empty()) + _namess << _agent->model_name; + else + _namess << _agent->product_name; + } + else + { + _namess << "UNKNOWN MEMORY OPERATION [" << itr << "] "; + } + auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; + auto _desc = _track.Serialize(); + _desc.set_name(_namess.str()); + + perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); + + agent_tracks.emplace(itr, _track); + } + auto agent_queue_tracks = std::unordered_map>{}; @@ -2155,35 +2187,6 @@ write_perfetto() itr.end_timestamp); } - for(auto itr : memory_allocation_bf_records) - { - auto name = buffer_names.at(itr.kind, itr.operation); - auto& track = agent_tracks.at(itr.agent_id.handle); - - TRACE_EVENT_BEGIN(sdk::perfetto_category::name, - ::perfetto::StaticString(name.data()), - track, - itr.start_timestamp, - ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), - "begin_ns", - itr.start_timestamp, - "kind", - itr.kind, - "operation", - itr.operation, - "agent", - agents_map.at(itr.agent_id).logical_node_id, - "Allocation_size", - itr.allocation_size, - "Starting_address", - itr.starting_address); - TRACE_EVENT_END(sdk::perfetto_category::name, - track, - itr.end_timestamp, - "end_ns", - itr.end_timestamp); - } - auto demangled = std::unordered_map{}; for(auto itr : kernel_dispatch_bf_records) {