Skip to content

Commit

Permalink
Omit CUDA sync events on unused CUDA streams (#795)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #795

CUDA WaitforStream sync events can lead to a lot of verbosity in the traces.
It looks like PyTorch cycles through all the streams to add these wait calls but they are no-ops on stream that are not in active use.
 {F1063309394}

This change will simply filter out WaitforStream events whenever a CUDA stream has not been used.
* Track device, stream used in a set, this updated by an GPU kernel or memcpy or memset.
* Filter waitforstream events.

Reviewed By: chaekit, davidberard98

Differential Revision: D47975637

fbshipit-source-id: 26796b6e703f12cdb91bbe99c75db6d89a4c866f
  • Loading branch information
briancoutinho authored and facebook-github-bot committed Aug 8, 2023
1 parent 8d4234c commit f6c845e
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 13 deletions.
33 changes: 25 additions & 8 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,12 @@ struct CtxEventPair {

template<>
struct std::hash<CtxEventPair> {
std::size_t operator()(const CtxEventPair& c) const {
return std::hash<uint32_t>()(c.ctx) ^ std::hash<uint32_t>()(c.eventId);
}
std::size_t operator()(const CtxEventPair& c) const {
return KINETO_NAMESPACE::detail::hash_combine(
std::hash<uint32_t>()(c.ctx),
std::hash<uint32_t>()(c.eventId)
);
}
};

namespace {
Expand Down Expand Up @@ -508,15 +511,26 @@ void CuptiActivityProfiler::handleCudaSyncActivity(
<< " eventId=" << activity->cudaEventId
<< " contextId=" << activity->contextId;

if(!config_->activitiesCudaSyncWaitEvents() && isEventSync(activity->type)) {
if (!config_->activitiesCudaSyncWaitEvents() && isEventSync(activity->type)) {
return;
}

auto device_id = contextIdtoDeviceId(activity->contextId);

// Event Sync events tend to be noisy, only pass these events if
// there was some GPU kernel/memcopy/memset observed on it till now.
if (isEventSync(activity->type) &&
(seenDeviceStreams_.find({device_id, activity->streamId}) ==
seenDeviceStreams_.end())) {
VLOG(2) << "Skipping Event Sync (corrId=" << activity->correlationId
<< ") as no kernels have run yet on stream = " << activity->streamId;
return;
}

if(int32_t(activity->streamId) != -1) {
recordStream(
contextIdtoDeviceId(activity->contextId), activity->streamId, "");
if (int32_t(activity->streamId) != -1) {
recordStream(device_id, activity->streamId, "");
} else {
recordDevice(contextIdtoDeviceId(activity->contextId));
recordDevice(device_id);
}

const ITraceActivity* linked =
Expand Down Expand Up @@ -590,6 +604,8 @@ inline void CuptiActivityProfiler::handleGpuActivity(
checkTimestampOrder(&act);
VLOG(2) << act.correlationId() << ": " << act.name();
recordStream(act.deviceId(), act.resourceId(), "");
seenDeviceStreams_.insert({act.deviceId(), act.resourceId()});

act.log(*logger);
updateGpuNetSpan(act);
if (derivedConfig_->profileActivityTypes().count(
Expand Down Expand Up @@ -1090,6 +1106,7 @@ void CuptiActivityProfiler::resetTraceData() {
gpuUserEventMap_.clear();
traceSpans_.clear();
clientActivityTraceMap_.clear();
seenDeviceStreams_.clear();
traceBuffers_ = nullptr;
metadata_.clear();
sessions_.clear();
Expand Down
28 changes: 28 additions & 0 deletions libkineto/src/CuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ struct ConfigDerivedState final {
bool profilingByIter_{false};
};

namespace detail {
inline size_t hash_combine(size_t seed, size_t value) {
return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u));
}
} // namespace detail

class CuptiActivityProfiler {
public:
CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly);
Expand Down Expand Up @@ -420,6 +426,28 @@ class CuptiActivityProfiler {
// span name -> iteration count
std::map<std::string, int> iterationCountMap_;

struct DevStream {
int64_t ctx = 0;
int64_t stream = 0;
bool operator==(const DevStream& other) const {
return (this->ctx == other.ctx) && (this->stream == other.stream);
}
};

struct DevStreamHash {
std::size_t operator()(const DevStream& c) const {
return detail::hash_combine(
std::hash<int64_t>()(c.ctx),
std::hash<int64_t>()(c.stream)
);
}
};

// This set tracks the (device, cuda streams) observed in the trace
// doing CUDA kernels/memcopies. This prevents emitting CUDA sync
// events on streams with no activity.
std::unordered_set<DevStream, DevStreamHash> seenDeviceStreams_;

// Buffers where trace data is stored
std::unique_ptr<ActivityBuffers> traceBuffers_;

Expand Down
18 changes: 13 additions & 5 deletions libkineto/test/CuptiActivityProfilerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,13 @@ struct MockCuptiActivityBuffer {

void addSyncActivity(
int64_t start_us, int64_t end_us, int64_t correlation,
CUpti_ActivitySynchronizationType type) {
CUpti_ActivitySynchronizationType type, int64_t stream = 1) {
auto& act = createActivity<CUpti_ActivitySynchronization>(
start_us, end_us, correlation);
act.kind = CUPTI_ACTIVITY_KIND_SYNCHRONIZATION;
act.type = type;
act.contextId = 0;
act.streamId = 1;
act.streamId = stream;
activities.push_back(reinterpret_cast<CUpti_Activity*>(&act));
}

Expand Down Expand Up @@ -434,6 +434,14 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) {
gpuOps->addKernelActivity(260, 320, 3);
gpuOps->addKernelActivity(330, 350, 4);
gpuOps->addSyncActivity(321, 323, 5, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE);
// Add wait event on kernel stream 1
gpuOps->addSyncActivity(
324, 326, 6, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT,
1 /*stream*/);
// This event should be ignored because it is not on a stream that has no GPU kernels
gpuOps->addSyncActivity(
326, 330, 7, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT,
4 /*stream*/);
cuptiActivities_.activityBuffer = std::move(gpuOps);

// Have the profiler process them
Expand All @@ -445,7 +453,7 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) {

// Wrapper that allows iterating over the activities
ActivityTrace trace(std::move(logger), loggerFactory);
EXPECT_EQ(trace.activities()->size(), 14);
EXPECT_EQ(trace.activities()->size(), 15);
std::map<std::string, int> activityCounts;
std::map<int64_t, int> resourceIds;
for (auto& activity : *trace.activities()) {
Expand All @@ -470,8 +478,8 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) {
auto sysTid = systemThreadId();
// Ops and runtime events are on thread sysTid along with the flow start events
EXPECT_EQ(resourceIds[sysTid], 9);
// Kernels and sync event are on stream 1, memcpy on stream 2
EXPECT_EQ(resourceIds[1], 4);
// Kernels and sync events are on stream 1, memcpy on stream 2
EXPECT_EQ(resourceIds[1], 5);
EXPECT_EQ(resourceIds[2], 1);

#ifdef __linux__
Expand Down

0 comments on commit f6c845e

Please sign in to comment.