Skip to content

Commit

Permalink
#1014 dump node stats before trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakub Strzebonski authored and cz4rs committed May 18, 2021
1 parent b23be5f commit a25f456
Show file tree
Hide file tree
Showing 7 changed files with 400 additions and 81 deletions.
9 changes: 8 additions & 1 deletion src/vt/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,12 @@ bool Runtime::finalize(bool const force_now) {
}

/**
 * \brief Synchronize the runtime with a barrier on the context's communicator.
 *
 * The communicator is fetched once and validated before it is handed to MPI.
 * If it is \c MPI_COMM_NULL the runtime aborts with a diagnostic instead of
 * issuing a barrier on an invalid communicator.
 */
void Runtime::sync() {
  MPI_Comm const comm = theContext->getComm();

  // Validate first: no MPI call may be made on a null communicator, and the
  // barrier must be issued exactly once per sync.
  if (comm == MPI_COMM_NULL) {
    vtAbort("Trying to sync runtime while the communicator is not available");
  } else {
    MPI_Barrier(comm);
  }
}

void Runtime::runScheduler() {
Expand Down Expand Up @@ -1216,6 +1221,7 @@ void Runtime::initializeComponents() {
theTrace->initialize();
#endif
theEvent->initialize();
vrt::collection::balance::ProcStats::initialize();

debug_print(runtime, node, "end: initializeComponents\n");
}
Expand Down Expand Up @@ -1362,6 +1368,7 @@ void Runtime::finalizeComponents() {

// Finalize memory usage component
util::memory::MemoryUsage::finalize();
vrt::collection::balance::ProcStats::finalize();

debug_print(runtime, node, "end: finalizeComponents\n");
}
Expand Down
2 changes: 2 additions & 0 deletions src/vt/vrt/collection/balance/lb_invoke/invoke.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ void LBManager::releaseNow(PhaseType phase) {
balance::ProcStats::startIterCleanup();
balance::ProcStats::clearStats();

balance::ProcStats::outputStatsForPhase(phase);

auto msg = makeMessage<CollectionPhaseMsg>();

// Destruct the objgroup that was used for LB
Expand Down
132 changes: 74 additions & 58 deletions src/vt/vrt/collection/balance/proc_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@

namespace vt { namespace vrt { namespace collection { namespace balance {

// File-wide shorthand for the runtime argument configuration; this file reads
// vt_lb_stats, vt_lb_stats_file and vt_lb_stats_dir through it.
using ArgType = vt::arguments::ArgConfig;

// Definition of the static load table, looked up by phase elsewhere in this
// file: each entry maps an element ID to the time recorded for that element.
// It starts out empty.
/*static*/
typename ProcStats::SparseMapType<std::unordered_map<ElementIDType,TimeType>>
ProcStats::proc_data_ = {};
Expand Down Expand Up @@ -146,7 +148,6 @@ ProcStats::getProcSubphaseLoad(PhaseType phase) {
}

/*static*/ void ProcStats::createStatsFile() {
using ArgType = vt::arguments::ArgConfig;
auto const node = theContext()->getNode();
auto const base_file = std::string(ArgType::vt_lb_stats_file);
auto const dir = std::string(ArgType::vt_lb_stats_dir);
Expand All @@ -155,7 +156,7 @@ ProcStats::getProcSubphaseLoad(PhaseType phase) {

debug_print(
lb, node,
"ProcStats: createStatsFile file={}\n", file_name
"ProcStats:: createStatsFile file={}\n", file_name
);

// Node 0 creates the directory
Expand All @@ -174,6 +175,7 @@ ProcStats::getProcSubphaseLoad(PhaseType phase) {
}

stats_file_ = fopen(file_name.c_str(), "w+");
vtAssertExpr(stats_file_ != nullptr);
}

/*static*/ void ProcStats::closeStatsFile() {
Expand All @@ -183,74 +185,88 @@ ProcStats::getProcSubphaseLoad(PhaseType phase) {
}
}

/*static*/ void ProcStats::outputStatsFile() {
if (stats_file_ == nullptr) {
createStatsFile();
/**
 * \brief Resolve the two endpoints of a communication edge for stats output.
 *
 * The endpoint accessors used depend on the communication category:
 *  - SendRecv / Broadcast:                      (toObj, fromObj)
 *  - NodeToCollection / NodeToCollectionBcast:  (toObj, fromNode)
 *  - CollectionToNode / CollectionToNodeBcast:  (toNode, fromObj)
 *
 * \param[in] comm the communication key to inspect
 *
 * \return a pair whose \c first is the receiving endpoint and whose \c second
 * is the sending endpoint; a value-initialized pair if the category is not
 * one of the known enumerators (after asserting)
 */
std::pair<ElementIDType, ElementIDType>
getRecvSendDirection(CommKeyType const& comm) {
  switch (comm.cat_) {
  case CommCategory::SendRecv:
  case CommCategory::Broadcast:
    return std::make_pair(comm.toObj(), comm.fromObj());

  case CommCategory::NodeToCollection:
  case CommCategory::NodeToCollectionBcast:
    return std::make_pair(comm.toObj(), comm.fromNode());

  case CommCategory::CollectionToNode:
  case CommCategory::CollectionToNodeBcast:
    return std::make_pair(comm.toNode(), comm.fromObj());
  }

  // Only reachable when cat_ holds a value outside the enumerators above.
  // This helper is independent of the stats file, so no file-handle check
  // belongs here.
  vtAssert(false, "Invalid balance::CommCategory enum value");
  return std::make_pair(ElementIDType{}, ElementIDType{});
}

auto const num_iters = proc_data_.size();
/**
 * \brief Open the stats output file when stats output is enabled.
 *
 * Compiled to a no-op unless the lblite backend feature is enabled. The file
 * is only created if \c vt_lb_stats is set and no stats file is open yet.
 */
/*static*/ void ProcStats::initialize() {
#if backend_check_enabled(lblite)
  if (not ArgType::vt_lb_stats) {
    return;
  }

  // Do not re-create the file if one is already open.
  if (stats_file_ != nullptr) {
    return;
  }

  createStatsFile();
#endif
}

/**
 * \brief Close the stats output file and drop the collected statistics.
 *
 * Compiled to a no-op unless the lblite backend feature is enabled, and does
 * nothing at runtime unless \c vt_lb_stats is set.
 */
/*static*/ void ProcStats::finalize() {
#if backend_check_enabled(lblite)
  if (not ArgType::vt_lb_stats) {
    return;
  }

  // Close the output file first, then clear the stats.
  closeStatsFile();
  clearStats();
#endif
}

vt_print(lb, "ProcStats::outputStatsFile: file={}, iter={}\n", print_ptr(stats_file_), num_iters);
/**
 * \brief Append the statistics recorded for one phase to the stats file.
 *
 * Does nothing unless \c vt_lb_stats is set. Two kinds of records are written:
 *  - one line per element with load data:
 *      <phase>,<element id>,<time>,<#subphases>,[<t0>,<t1>,...]
 *  - one line per communication edge:
 *      <phase>,<receiver>,<sender>,<value>,<category as integer>
 *
 * The file is flushed after the phase is written and is left open so that
 * later phases can be appended to it.
 *
 * \param[in] phase the phase whose statistics should be written
 */
/*static*/ void ProcStats::outputStatsForPhase(PhaseType phase) {
  // Statistics output when LB is enabled and appropriate flag is enabled
  if (!ArgType::vt_lb_stats) {
    return;
  }

  vtAssertExpr(stats_file_ != nullptr);
  debug_print(lb, node, "ProcStats::outputStatsForPhase: phase={}\n", phase);

  // Per-element load records, including the subphase breakdown.
  for (auto&& elm : proc_data_.at(phase)) {
    ElementIDType id = elm.first;
    TimeType time = elm.second;
    const auto& subphase_times = proc_subphase_data_.at(phase)[id];
    size_t subphases = subphase_times.size();

    auto obj_str = fmt::format("{},{},{},{},[", phase, id, time, subphases);

    for (size_t s = 0; s < subphases; s++) {
      // Separator goes before every entry except the first; this form is
      // also safe when there are no subphases at all.
      if (s > 0) {
        obj_str += ",";
      }

      obj_str += std::to_string(subphase_times[s]);
    }

    obj_str += "]\n";

    fprintf(stats_file_, "%s", obj_str.c_str());
  }

  // Communication records; endpoint selection is delegated to
  // getRecvSendDirection so every category is handled in one place.
  for (auto&& elm : proc_comm_.at(phase)) {
    using E = typename std::underlying_type<CommCategory>::type;

    auto const& comm = elm.first;
    auto const recvSend = getRecvSendDirection(comm);
    auto const cat = static_cast<E>(comm.cat_);
    auto obj_str = fmt::format(
      "{},{},{},{},{}\n", phase, recvSend.first, recvSend.second,
      elm.second, cat
    );
    fprintf(stats_file_, "%s", obj_str.c_str());
  }

  // Flush only: the file is not closed here because later phases append to it.
  fflush(stats_file_);
}

ElementIDType ProcStats::addProcStats(
Expand Down
4 changes: 3 additions & 1 deletion src/vt/vrt/collection/balance/proc_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,10 @@ struct ProcStats {
static void clearStats();
static void startIterCleanup();
static void releaseLB();
static void initialize();
static void finalize();

static void outputStatsFile();
static void outputStatsForPhase(PhaseType phase);

static SubphaseLoadMapType const& getProcSubphaseLoad(PhaseType phase);

Expand Down
8 changes: 0 additions & 8 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,6 @@ CollectionManager::CollectionManager() {
// Tear down the collection manager. The steps run in a fixed order: clean up
// the collections, then (lblite builds with vt_lb_stats set) write the stats
// file and clear the stats, and finally destroy the LBManager.
/*virtual*/ CollectionManager::~CollectionManager() {
cleanupAll<>();

// Statistics output when LB is enabled and appropriate flag is enabled
#if backend_check_enabled(lblite)
if (ArgType::vt_lb_stats) {
// Write the stats file first, then clear the stats.
balance::ProcStats::outputStatsFile();
balance::ProcStats::clearStats();
}
#endif

// Destroy the LBManager
balance::LBManager::destroy();
}
Expand Down
Loading

0 comments on commit a25f456

Please sign in to comment.