Skip to content

Commit

Permalink
rocprofv3: refactor and reorganize rocprofiler-sdk-tool library (#1138)
Browse files Browse the repository at this point in the history
* Add rocprofv3-multi-node.md to source/lib/rocprofiler-sdk-tool

* Initial source re-organization

- create "output" static library

* Update include/rocprofiler-sdk/cxx/serialization.hpp

- add GPR count fields to kernel symbol serialization

* Add source/scripts/generate-rocpd.py

- reads one or more JSON output files from rocprofv3 and writes rocpd SQLite3 database
- Note: preliminary implementation

* More reorganization b/t lib/rocprofiler-sdk-tool and lib/output

* Updates to generate-rocpd.py

- add SQL views
- option: --absolute-timestamps -> --normalize-timestamps
- option: --generic-markers
- misc fixes with regards to getting the views working
- support marker names

* Update generate-rocpd.py

- Add --marker-mode option

* Update generate-rocpd.py

- Improve debugging of bad bulk SQLite statements

* Update rocprofv3-multi-node.md

- cleanup of proposed SQL schema

* lib/output/format_path.{hpp,cpp}

- rename format to format_path (in config.hpp and config.cpp)
- move format_path functionality to format_path.{hpp,cpp}

* Rework lib/output/tmp_file_buffer.{hpp,cpp}

* Update output_key.cpp

- support %cwd%, %launch_date%

* Rework lib/output/buffered_output.hpp

* Support csv_output_file constructed via domain_type

* Update lib/output/domain_type.{hpp,cpp}

- get_domain_trace_file_name
- get_domain_stats_file_name

* Update lib/rocprofiler-sdk-tool/tool.cpp

- tweak headers

* Update lib/output/generate*.cpp

- remove include of helpers.hpp
- CSV uses domain_type for filenames

* Update samples/counter_collection/per_dev_serialization.cpp

- make wait_on volatile

* Remove tool_table from lib/output and lib/rocprofiler-sdk-tool

- Also split various structs into their own files
  - lib/output/agent_info
  - lib/output/metadata
  - lib/output/kernel_symbol_info
  - lib/output/counter_info
- Implemented rocprofiler::tool::metadata

* Optimize rocprofiler_tool_counter_collection_record_t

- reduce the size of the struct from 24784 bytes to 8376 bytes

* Introduced output_config

- split subset of config (from tools library) into output_config to be able to configure the output generating functions separately from the tool library
- this is a significant step towards the output generating functions not relying on static global memory

* Stream chunks of data into output instead of loading all info memory

* Remove duplicate group_segment_size in rocprofiler_kernel_dispatch_info_t serialization

* Adding Q&A to rocprofv3-multi-node.md

* Remove all remaining include lib/rocprofiler-sdk-tool from lib/output

- migrated a fair amount of code from lib/rocprofiler-sdk-tool/helper.hpp to lib/output

* Update Q&A of rocprofv3-multi-node.md

* Fix minor compilation errors + minor cleanup

* Update hsa/async_copy.cpp

- when ROCPROFILER_CI_STRICT_TIMESTAMPS > 0, reduce the active_signal sync wait time

* Update profiling_time.hpp

- fix log messages for when start/end time is less/greater than enqueue/current CPU time

* Fix generate_stats for tool_counter_record_t

* Dictionary optimization for generate-rocpd.py

---------

Co-authored-by: SrirakshaNag <[email protected]>
  • Loading branch information
jrmadsen and SrirakshaNag authored Nov 7, 2024
1 parent 98858b6 commit 5eb8c26
Show file tree
Hide file tree
Showing 59 changed files with 5,524 additions and 3,294 deletions.
2 changes: 1 addition & 1 deletion samples/advanced_thread_trace/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ target_link_libraries(
PRIVATE rocprofiler-sdk::rocprofiler-sdk amd_comgr
rocprofiler-sdk::samples-common-library rocprofiler-sdk::samples-build-flags)

rocprofiler_samples_get_preload_env(PRELOAD_ENV advanced-thread-trace)
rocprofiler_samples_get_preload_env(PRELOAD_ENV)

add_test(NAME advanced-thread-trace COMMAND $<TARGET_FILE:advanced-thread-trace>)

Expand Down
2 changes: 1 addition & 1 deletion samples/code_object_isa_decode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ target_link_libraries(
PRIVATE rocprofiler-sdk::samples-common-library rocprofiler-sdk::rocprofiler-sdk
amd_comgr rocprofiler-sdk::samples-build-flags)

rocprofiler_samples_get_preload_env(PRELOAD_ENV code-object-isa-decode)
rocprofiler_samples_get_preload_env(PRELOAD_ENV)

add_test(NAME code-object-isa-decode COMMAND $<TARGET_FILE:code-object-isa-decode>)

Expand Down
23 changes: 16 additions & 7 deletions samples/counter_collection/per_dev_serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@
} while(0)

__global__ void
kernelA(int* wait_on, int value, int* no_opt)
kernelA(int devid, volatile int* wait_on, int value, int* no_opt)
{
printf("[device=%i][begin] Wait on %i: %i (%i)\n", devid, value, *wait_on, *no_opt);
while(*wait_on != value)
{
(*no_opt)++;
};
printf("[device=%i][break] Wait on %i: %i (%i)\n", devid, value, *wait_on, *no_opt);
(*wait_on)--;
printf("[device=%i][return] Wait on %i: %i (%i)\n", devid, value, *wait_on, *no_opt);
}

int
Expand All @@ -53,17 +56,23 @@ main(int, char**)
if(ntotdevice < 2) return 0;

start();
int* check_value = nullptr;
int* no_opt = nullptr;
volatile int* check_value = nullptr;
int* no_opt_0 = nullptr;
int* no_opt_1 = nullptr;
HIP_CALL(hipMallocManaged(&check_value, sizeof(*check_value)));
HIP_CALL(hipMallocManaged(&no_opt, sizeof(*no_opt)));
*no_opt = 0;
HIP_CALL(hipMallocManaged(&no_opt_0, sizeof(*no_opt_0)));
HIP_CALL(hipMallocManaged(&no_opt_1, sizeof(*no_opt_1)));
*no_opt_0 = 0;
*no_opt_1 = 0;
*check_value = 1;

// Will hang if per-device serialization is not functional
HIP_CALL(hipSetDevice(0));
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, check_value, 0, no_opt);
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, 0, check_value, 0, no_opt_0);

HIP_CALL(hipSetDevice(1));
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, check_value, 1, no_opt);
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, 1, check_value, 1, no_opt_1);

HIP_CALL(hipSetDevice(0));
HIP_CALL(hipDeviceSynchronize());

Expand Down
4 changes: 3 additions & 1 deletion source/include/rocprofiler-sdk/cxx/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_code_object_kernel_symbol_regist
ROCP_SDK_SAVE_DATA_FIELD(kernarg_segment_alignment);
ROCP_SDK_SAVE_DATA_FIELD(group_segment_size);
ROCP_SDK_SAVE_DATA_FIELD(private_segment_size);
ROCP_SDK_SAVE_DATA_FIELD(sgpr_count);
ROCP_SDK_SAVE_DATA_FIELD(arch_vgpr_count);
ROCP_SDK_SAVE_DATA_FIELD(accum_vgpr_count);
}

template <typename ArchiveT>
Expand Down Expand Up @@ -301,7 +304,6 @@ save(ArchiveT& ar, rocprofiler_kernel_dispatch_info_t data)
ROCP_SDK_SAVE_DATA_FIELD(private_segment_size);
ROCP_SDK_SAVE_DATA_FIELD(group_segment_size);
ROCP_SDK_SAVE_DATA_FIELD(workgroup_size);
ROCP_SDK_SAVE_DATA_FIELD(group_segment_size);
ROCP_SDK_SAVE_DATA_FIELD(grid_size);
}

Expand Down
1 change: 1 addition & 0 deletions source/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#
set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "core")
add_subdirectory(common)
add_subdirectory(output)
add_subdirectory(rocprofiler-sdk)

set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "tools")
Expand Down
11 changes: 11 additions & 0 deletions source/lib/common/synchronized.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class Synchronized
// Do not allow this data structure to be copied, std::move only.
Synchronized(const Synchronized&) = delete;

// return a copy of the data
value_type get() const;

template <typename FuncT, typename... Args>
decltype(auto) rlock(FuncT&& lambda, Args&&... args) const;

Expand Down Expand Up @@ -100,6 +103,14 @@ class Synchronized
//
// member definitions
//
template <typename LockedType, bool IsMappedTypeV>
typename Synchronized<LockedType, IsMappedTypeV>::value_type
Synchronized<LockedType, IsMappedTypeV>::get() const
{
auto lock = std::shared_lock{m_mutex};
return m_data;
}

template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args>
decltype(auto)
Expand Down
61 changes: 61 additions & 0 deletions source/lib/output/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#
# Tool library used by rocprofiler
#

rocprofiler_activate_clang_tidy()

set(TOOL_OUTPUT_HEADERS
agent_info.hpp
buffered_output.hpp
counter_info.hpp
csv.hpp
csv_output_file.hpp
domain_type.hpp
format_path.hpp
generateCSV.hpp
generateJSON.hpp
generateOTF2.hpp
generatePerfetto.hpp
generateStats.hpp
generator.hpp
kernel_symbol_info.hpp
metadata.hpp
output_config.hpp
output_key.hpp
output_stream.hpp
statistics.hpp
timestamps.hpp
tmp_file_buffer.hpp
tmp_file.hpp)

set(TOOL_OUTPUT_SOURCES
csv_output_file.cpp
domain_type.cpp
format_path.cpp
generateCSV.cpp
generateJSON.cpp
generateOTF2.cpp
generatePerfetto.cpp
generateStats.cpp
metadata.cpp
output_config.cpp
output_key.cpp
output_stream.cpp
statistics.cpp
tmp_file_buffer.cpp
tmp_file.cpp)

add_library(rocprofiler-sdk-output-library STATIC)
add_library(rocprofiler-sdk::rocprofiler-sdk-output-library ALIAS
rocprofiler-sdk-output-library)
target_sources(rocprofiler-sdk-output-library PRIVATE ${TOOL_OUTPUT_SOURCES}
${TOOL_OUTPUT_HEADERS})
target_link_libraries(
rocprofiler-sdk-output-library
PRIVATE rocprofiler-sdk::rocprofiler-sdk-headers
rocprofiler-sdk::rocprofiler-sdk-build-flags
rocprofiler-sdk::rocprofiler-sdk-memcheck
rocprofiler-sdk::rocprofiler-sdk-common-library
rocprofiler-sdk::rocprofiler-sdk-cereal
rocprofiler-sdk::rocprofiler-sdk-perfetto
rocprofiler-sdk::rocprofiler-sdk-otf2)
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,50 @@

#pragma once

#include "helper.hpp"
#include "statistics.hpp"
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/cxx/serialization.hpp>

#include <unordered_map>
#include <vector>
#include "rocprofiler-sdk/fwd.h"

namespace rocprofiler
{
namespace tool
{
void
write_json(tool_table* tool_functions,
uint64_t pid,
const domain_stats_vec_t& domain_stats,
std::vector<rocprofiler_agent_v0_t> agent_data,
std::vector<rocprofiler_tool_counter_info_t> counter_data,
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_deque,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_deque,
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_deque,
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_deque,
std::deque<rocprofiler_tool_counter_collection_record_t>* counter_collection_deque,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_deque,
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* scratch_memory_deque,
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_deque);
struct agent_info : rocprofiler_agent_v0_t
{
using base_type = rocprofiler_agent_v0_t;

agent_info(base_type _base)
: base_type{_base}
{}

~agent_info() = default;
agent_info(const agent_info&) = default;
agent_info(agent_info&&) noexcept = default;
agent_info& operator=(const agent_info&) = default;
agent_info& operator=(agent_info&&) noexcept = default;

int64_t gpu_index =
(base_type::type == ROCPROFILER_AGENT_TYPE_GPU) ? base_type::logical_node_type_id : -1;
};

using agent_info_vec_t = std::vector<agent_info>;
using agent_info_map_t = std::unordered_map<rocprofiler_agent_id_t, agent_info>;
} // namespace tool
} // namespace rocprofiler

namespace cereal
{
#define SAVE_DATA_FIELD(FIELD) ar(make_nvp(#FIELD, data.FIELD))

template <typename ArchiveT>
void
save(ArchiveT& ar, const ::rocprofiler::tool::agent_info& data)
{
cereal::save(ar, static_cast<const rocprofiler_agent_v0_t&>(data));
}

#undef SAVE_DATA_FIELD
} // namespace cereal
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

#pragma once

#include "helper.hpp"
#include "counter_info.hpp"
#include "generator.hpp"
#include "statistics.hpp"
#include "tmp_file_buffer.hpp"

Expand All @@ -31,6 +32,8 @@

#include <fmt/format.h>

#include <deque>

namespace rocprofiler
{
namespace tool
Expand All @@ -41,25 +44,27 @@ using stats_data_t = statistics<uint64_t, float_type>;
template <typename Tp, domain_type DomainT>
struct buffered_output
{
using ring_buffer_type = rocprofiler::common::container::ring_buffer<Tp>;
using type = Tp;
static constexpr auto buffer_type_v = DomainT;

explicit buffered_output(bool _enabled);
~buffered_output() = default;
buffered_output(const buffered_output&) = delete;
buffered_output(buffered_output&&) noexcept = delete;
buffered_output& operator=(const buffered_output&) = default;
buffered_output& operator=(buffered_output&&) noexcept = default;
buffered_output& operator=(const buffered_output&) = delete;
buffered_output& operator=(buffered_output&&) noexcept = delete;

operator bool() const { return enabled; }

void flush();
void read();
void clear();
void destroy();

operator bool() const { return enabled; }
generator<Tp> get_generator() const { return generator<Tp>{get_tmp_file_buffer<Tp>(DomainT)}; }
std::deque<Tp> load_all();

std::deque<Tp> element_data = {};
stats_entry_t stats = {};
stats_entry_t stats = {};

private:
bool enabled = false;
Expand All @@ -76,7 +81,7 @@ buffered_output<Tp, DomainT>::flush()
{
if(!enabled) return;

flush_tmp_buffer<ring_buffer_type>(buffer_type_v);
flush_tmp_buffer<type>(buffer_type_v);
}

template <typename Tp, domain_type DomainT>
Expand All @@ -87,16 +92,33 @@ buffered_output<Tp, DomainT>::read()

flush();

element_data = get_buffer_elements(read_tmp_file<ring_buffer_type>(buffer_type_v));
read_tmp_file<type>(buffer_type_v);
}

template <typename Tp, domain_type DomainT>
std::deque<Tp>
buffered_output<Tp, DomainT>::load_all()
{
auto data = std::deque<Tp>{};
if(enabled)
{
auto gen = get_generator();
for(auto ditr : gen)
{
for(auto itr : gen.get(ditr))
{
data.emplace_back(itr);
}
}
}
return data;
}

template <typename Tp, domain_type DomainT>
void
buffered_output<Tp, DomainT>::clear()
{
if(!enabled) return;

element_data.clear();
}

template <typename Tp, domain_type DomainT>
Expand All @@ -106,10 +128,30 @@ buffered_output<Tp, DomainT>::destroy()
if(!enabled) return;

clear();
auto [_tmp_buf, _tmp_file] = get_tmp_file_buffer<ring_buffer_type>(buffer_type_v);
_tmp_buf->destroy();
delete _tmp_buf;
delete _tmp_file;
auto*& filebuf = get_tmp_file_buffer<type>(buffer_type_v);
file_buffer<type>* tmp = nullptr;
std::swap(filebuf, tmp);
tmp->buffer.destroy();
delete tmp;
}

using hip_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_hip_api_record_t, domain_type::HIP>;
using hsa_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_hsa_api_record_t, domain_type::HSA>;
using kernel_dispatch_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_kernel_dispatch_record_t,
domain_type::KERNEL_DISPATCH>;
using memory_copy_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_memory_copy_record_t, domain_type::MEMORY_COPY>;
using marker_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_marker_api_record_t, domain_type::MARKER>;
using rccl_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_rccl_api_record_t, domain_type::RCCL>;
using counter_collection_buffered_output_t =
buffered_output<tool_counter_record_t, domain_type::COUNTER_COLLECTION>;
using scratch_memory_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_scratch_memory_record_t,
domain_type::SCRATCH_MEMORY>;
} // namespace tool
} // namespace rocprofiler
Loading

0 comments on commit 5eb8c26

Please sign in to comment.