Skip to content

Commit

Permalink
[AFAR VII] rocprofiler_sample_device_counting_service return data as …
Browse files Browse the repository at this point in the history
…part of API call (#57)


---------

Co-authored-by: Benjamin Welton <[email protected]>
Co-authored-by: Benjamin Welton <[email protected]>
  • Loading branch information
3 people authored Dec 7, 2024
1 parent bd33176 commit 253c9ad
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 54 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
- Changed naming of "dispatch profiling service" to a more descriptive "dispatch counting service". To convert existing tool or user code to the new names, the following sed can be used: `-type f -exec sed -i -e 's/dispatch_profile_counting_service/dispatch_counting_service/g' -e 's/dispatch_profile.h/dispatch_counting_service.h/g' -e 's/rocprofiler_profile_counting_dispatch_callback_t/rocprofiler_dispatch_counting_service_callback_t/g' -e 's/rocprofiler_profile_counting_dispatch_data_t/rocprofiler_dispatch_counting_service_data_t/g' -e 's/rocprofiler_profile_counting_dispatch_record_t/rocprofiler_dispatch_counting_service_record_t/g' {} +`
- `FETCH_SIZE` metric on gfx94x now uses `TCC_BUBBLE` for 128B reads.
- PMC dispatch-based counter collection serialization is now per-device instead of being global across all devices.
- Added output return functionality to `rocprofiler_sample_device_counting_service`.
- Added `rocprofiler_load_counter_definition`.

### Resolved issues
Expand Down
2 changes: 1 addition & 1 deletion samples/counter_collection/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
// below to select the profile config to use when a kernel dispatch is
// received.
get_profile_cache().emplace(
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"SQ_WAVES"}));
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"TCC_HIT"}));
}

auto client_thread = rocprofiler_callback_thread_t{};
Expand Down
7 changes: 5 additions & 2 deletions samples/counter_collection/device_counting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,11 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
rocprofiler_start_context(get_client_ctx());
while(exit_toggle().load() == false)
{
rocprofiler_sample_device_counting_service(
get_client_ctx(), {.value = count}, ROCPROFILER_COUNTER_FLAG_NONE);
rocprofiler_sample_device_counting_service(get_client_ctx(),
{.value = count},
ROCPROFILER_COUNTER_FLAG_NONE,
nullptr,
nullptr);
count++;
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
Expand Down
16 changes: 13 additions & 3 deletions source/include/rocprofiler-sdk/device_counting_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,28 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
* @param [in] context_id context id
* @param [in] user_data User supplied data, included in records outputted to buffer.
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
* @param [in,out] output_records Output records collected via sampling (output is also written to
* buffer). Must be allocated by the caller. May be null, in which case records are only written
* to the buffer.
* @param [in,out] rec_count On entry, the maximum number of records rocprof can store in
* output_records. On exit, contains the actual number of records. Must not be null when
* output_records is set.
* @return ::rocprofiler_status_t
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or
* the context is not configured for agent profiling.
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR Returned if another operation is in progress (
* start/stop ctx or another read).
* @retval ::ROCPROFILER_STATUS_ERROR Returned if HSA has not been initialized yet.
* @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES Returned if output_records is set but
* rec_count is too small to store the results; rec_count is updated to the required size.
* @retval ::ROCPROFILER_STATUS_SUCCESS Returned if read request was successful.
* @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT Returned if the ASYNC flag is used while
* output_records is not null.
*/
rocprofiler_status_t
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags) ROCPROFILER_API;
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
rocprofiler_record_counter_t* output_records,
size_t* rec_count) ROCPROFILER_API;

/** @} */

Expand Down
52 changes: 28 additions & 24 deletions source/lib/rocprofiler-sdk/counters/device_counting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
{
val.user_data = callback_data.user_data;
val.agent_id = prof_config->agent->id;
if(callback_data.cached_counters)
{
callback_data.cached_counters->push_back(val);
}
buf->emplace(
ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, val);
}
Expand Down Expand Up @@ -253,9 +257,10 @@ init_callback_data(rocprofiler::counters::agent_callback_data& callback_data,
* and trigger the async handler manually.
*/
rocprofiler_status_t
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags)
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
std::vector<rocprofiler_record_counter_t>* out_counters)
{
rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS;
if(!ctx->device_counter_collection)
Expand All @@ -282,6 +287,18 @@ read_agent_ctx(const context::context* ctx,

for(auto& callback_data : agent_ctx.agent_data)
{
auto wait_if_sync = [&]() {
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
{
// Wait for any in-progress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
};

if(!callback_data.profile || !callback_data.set_profile) continue;
const auto* agent = agent::get_agent_cache(callback_data.profile->agent);

Expand All @@ -295,23 +312,19 @@ read_agent_ctx(const context::context* ctx,
// No AQL packet, nothing to do here.
if(!callback_data.packet) continue;

wait_if_sync();

if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
callback_data.cached_counters = out_counters;

// If we have a packet but no hardware counters, the caller is expecting
// non-hardware-based counter values to be returned. We can skip packet injection
// and trigger the async handler directly.
if(callback_data.profile->reqired_hw_counters.empty())
{
callback_data.user_data = user_data;
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, -1);
// Wait for the barrier/read packet to complete
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
// Wait for any inprogress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
wait_if_sync();
continue;
}

Expand All @@ -334,17 +347,8 @@ read_agent_ctx(const context::context* ctx,
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, 0);
callback_data.user_data = user_data;
submitPacket(agent->profile_queue(), &barrier.barrier_and);

// Wait for the barrier/read packet to complete
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
// Wait for any inprogress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
wait_if_sync();
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) callback_data.cached_counters = nullptr;
}

agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::ENABLED);
Expand Down
19 changes: 11 additions & 8 deletions source/lib/rocprofiler-sdk/counters/device_counting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <cstddef>

namespace rocprofiler
{
Expand Down Expand Up @@ -55,11 +56,12 @@ struct agent_callback_data
rocprofiler_user_data_t user_data = {.value = 0};
rocprofiler_user_data_t callback_data = {.value = 0};

std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
rocprofiler_agent_id_t agent_id = {.handle = 0};
rocprofiler_device_counting_service_callback_t cb = nullptr;
rocprofiler_buffer_id_t buffer = {.handle = 0};
bool set_profile = false;
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
rocprofiler_agent_id_t agent_id = {.handle = 0};
rocprofiler_device_counting_service_callback_t cb = nullptr;
rocprofiler_buffer_id_t buffer = {.handle = 0};
bool set_profile = false;
std::vector<rocprofiler_record_counter_t>* cached_counters = nullptr;

agent_callback_data() = default;
agent_callback_data(agent_callback_data&& rhs) noexcept
Expand Down Expand Up @@ -115,9 +117,10 @@ stop_agent_ctx(const context::context* ctx);
// read calls are not allowed in ASYNC mode and will result in
// this call waiting for the previous sample to complete.
rocprofiler_status_t
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags);
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
std::vector<rocprofiler_record_counter_t>* out_counters);

uint64_t
submitPacket(hsa_queue_t* queue, const void* packet);
Expand Down
63 changes: 51 additions & 12 deletions source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ test_init()
hsa::get_queue_controller()->init(get_api_table(), get_ext_table());
}

std::vector<rocprofiler_record_counter_t>&
common::Synchronized<std::vector<rocprofiler_record_counter_t>>&
global_recs()
{
static std::vector<rocprofiler_record_counter_t> recs;
static common::Synchronized<std::vector<rocprofiler_record_counter_t>> recs;
return recs;
}

Expand Down Expand Up @@ -146,7 +146,7 @@ check_output_created(rocprofiler_context_id_t,
}
found_value = record->user_data.value;
// ROCP_ERROR << fmt::format("Found counter value: {}", record->counter_value);
global_recs().push_back(*record);
global_recs().wlock([&](auto& data) { data.push_back(*record); });
}
}

Expand Down Expand Up @@ -319,6 +319,7 @@ class device_counting_service_test : public ::testing::Test
size_t track_metric = 0;
for(auto& metric : metrics)
{
std::vector<rocprofiler_record_counter_t> output_records(10000);
// global_recs().clear();
track_metric++;
ROCP_ERROR << "Testing metric " << metric.name();
Expand Down Expand Up @@ -402,9 +403,23 @@ class device_counting_service_test : public ::testing::Test
HSA_WAIT_STATE_BLOCKED);

// Sample the counting service.
ROCPROFILER_CALL(
rocprofiler_sample_device_counting_service(ctx, {.value = track_metric}, flags),
"Could not sample");

if(flags == ROCPROFILER_COUNTER_FLAG_ASYNC)
{
ROCPROFILER_CALL(rocprofiler_sample_device_counting_service(
ctx, {.value = track_metric}, flags, nullptr, nullptr),
"Could not sample");
}
else
{
global_recs().wlock([&](auto& _data) { _data.clear(); });
size_t out_count = output_records.size();
ROCPROFILER_CALL(
rocprofiler_sample_device_counting_service(
ctx, {.value = track_metric}, flags, output_records.data(), &out_count),
"Could not sample");
output_records.resize(out_count);
}
ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "Could not stop context");
rocprofiler_flush_buffer(opt_buff_id);

Expand All @@ -417,6 +432,27 @@ class device_counting_service_test : public ::testing::Test
{
ROCP_FATAL << "Failed to get data for " << metric.name();
}
else if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
auto recs_local = global_recs().rlock([](const auto& data) { return data; });

if(recs_local.size() != output_records.size())
{
ROCP_FATAL << "Output size does not match: " << recs_local.size() << " "
<< output_records.size();
}
if(!std::equal(recs_local.begin(),
recs_local.end(),
output_records.begin(),
[](const auto& a, const auto& b) {
return a.id == b.id && a.counter_value == b.counter_value &&
a.dispatch_id == b.dispatch_id &&
a.agent_id.handle == b.agent_id.handle;
}))
{
ROCP_FATAL << "Output does not match between buffer and callback";
}
}
}
hsa_signal_destroy(completion_signal);
hsa_signal_destroy(found_data);
Expand Down Expand Up @@ -599,9 +635,10 @@ TEST_F(device_counting_service_test, async_counters) { test_run(ROCPROFILER_COUN
TEST_F(device_counting_service_test, sync_grbm_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GRBM_COUNT"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();

for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
Expand All @@ -615,9 +652,10 @@ TEST_F(device_counting_service_test, sync_grbm_verify)
TEST_F(device_counting_service_test, sync_gpu_util_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GPU_UTIL"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();

for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
Expand All @@ -631,9 +669,10 @@ TEST_F(device_counting_service_test, sync_gpu_util_verify)
TEST_F(device_counting_service_test, sync_sq_waves_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"SQ_WAVES_sum"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();

for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
Expand Down
34 changes: 30 additions & 4 deletions source/lib/rocprofiler-sdk/device_counting_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
#include "rocprofiler-sdk/fwd.h"

#include <string.h>

extern "C" {
rocprofiler_status_t
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
Expand All @@ -40,11 +42,35 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
}

rocprofiler_status_t
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags)
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
rocprofiler_record_counter_t* output_records,
size_t* rec_count)
{
if(output_records != nullptr)
{
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
CHECK(rec_count);
auto recs = std::vector<rocprofiler_record_counter_t>{};
auto status = rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
if(status == ROCPROFILER_STATUS_SUCCESS)
{
if(recs.size() > *rec_count)
{
*rec_count = recs.size();
return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
}
*rec_count = recs.size();
std::memcpy(
output_records, recs.data(), sizeof(rocprofiler_record_counter_t) * recs.size());
}
return status;
}

return rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags);
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
}
}

0 comments on commit 253c9ad

Please sign in to comment.