Skip to content

Commit

Permalink
#1014 dump node stats before trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakub Strzebonski authored and Braden Mailloux committed Oct 15, 2020
1 parent 86b178f commit 580823d
Show file tree
Hide file tree
Showing 7 changed files with 301 additions and 86 deletions.
10 changes: 9 additions & 1 deletion src/vt/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,15 @@ bool Runtime::finalize(bool const force_now, bool const disable_sig) {
}

/**
 * Block in MPI_Barrier on the runtime's communicator.
 *
 * Uses communicator_ when it is valid; otherwise falls back to the
 * communicator held by theContext (if a context exists). Aborts when no
 * usable communicator can be found.
 */
void Runtime::sync() {
  // Resolve the communicator first: issuing the barrier before this check
  // could invoke MPI_Barrier on MPI_COMM_NULL.
  MPI_Comm comm = communicator_;
  if (comm == MPI_COMM_NULL and theContext != nullptr) {
    comm = theContext->getComm();
  }
  if (comm == MPI_COMM_NULL) {
    vtAbort("Trying to sync runtime while the communicator is not available");
  } else {
    MPI_Barrier(comm);
  }
}

void Runtime::runScheduler() {
Expand Down
2 changes: 2 additions & 0 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,8 @@ void LBManager::releaseNow(PhaseType phase) {
);
}

theNodeStats()->outputStatsForPhase(phase);

auto msg = makeMessage<CollectionPhaseMsg>();

// Destruct the objgroup that was used for LB
Expand Down
136 changes: 76 additions & 60 deletions src/vt/vrt/collection/balance/node_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,24 @@ void NodeStats::releaseLB() {
CollectionManager::releaseLBPhase(msg_hold.get());
}

/**
 * Component start-up hook.
 *
 * When built with lblite and the vt_lb_stats option is set, creates the
 * stats output file via createStatsFile(). Does nothing otherwise.
 */
void NodeStats::initialize() {
#if vt_check_enabled(lblite)
  if (not theConfig()->vt_lb_stats) {
    return;
  }
  theNodeStats()->createStatsFile();
#endif
}

void NodeStats::createStatsFile() {
auto const node = theContext()->getNode();
auto const base_file = std::string(theConfig()->vt_lb_stats_file);
auto const dir = std::string(theConfig()->vt_lb_stats_dir);
auto const base_file = theConfig()->vt_lb_stats_file;
auto const dir = theConfig()->vt_lb_stats_dir;
auto const file = fmt::format("{}.{}.out", base_file, node);
auto const file_name = fmt::format("{}/{}", dir, file);

vt_debug_print(
lb, node,
"NodeStats: createStatsFile file={}\n", file_name
"NodeStats::createStatsFile: file={}\n", file_name
);

// Node 0 creates the directory
Expand All @@ -198,83 +206,91 @@ void NodeStats::createStatsFile() {
}

stats_file_ = fopen(file_name.c_str(), "w+");
vtAssertExpr(stats_file_ != nullptr);
}

/**
 * Component shutdown hook.
 *
 * When built with lblite and the vt_lb_stats option is set, closes the stats
 * output file and then clears the collected statistics. Does nothing
 * otherwise.
 */
void NodeStats::finalize() {
#if vt_check_enabled(lblite)
  if (not theConfig()->vt_lb_stats) {
    return;
  }
  // Close the file before dropping the in-memory statistics
  closeStatsFile();
  clearStats();
#endif
}

/**
 * Close the stats output file if one is open.
 *
 * Safe to call repeatedly: the handle is reset to nullptr after closing, so
 * later calls are no-ops.
 */
void NodeStats::closeStatsFile() {
  if (stats_file_ == nullptr) {
    return;
  }
  fclose(stats_file_);
  // Reset exactly once so a second call does not fclose a stale handle
  stats_file_ = nullptr;
}

void NodeStats::outputStatsFile() {
if (stats_file_ == nullptr) {
createStatsFile();
/**
 * Map a communication key to its (receiver, sender) identifiers.
 *
 * The pair's first element is the receiving side and the second is the
 * sending side; which accessor supplies each depends on the category:
 *  - SendRecv / Broadcast:                          (toObj,  fromObj)
 *  - NodeToCollection / NodeToCollectionBcast:      (toObj,  fromNode)
 *  - CollectionToNode / CollectionToNodeBcast:      (toNode, fromObj)
 *
 * \param comm the communication key to inspect
 * \return the (receiver, sender) pair; a pair of value-initialized IDs after
 *         asserting when the category is not one of the values above
 */
std::pair<ElementIDType, ElementIDType>
getRecvSendDirection(CommKeyType const& comm) {
  // No default label on purpose: every handled category returns directly, and
  // anything else falls through to the assertion below.
  switch (comm.cat_) {
  case CommCategory::SendRecv:
  case CommCategory::Broadcast:
    return std::make_pair(comm.toObj(), comm.fromObj());

  case CommCategory::NodeToCollection:
  case CommCategory::NodeToCollectionBcast:
    return std::make_pair(comm.toObj(), comm.fromNode());

  case CommCategory::CollectionToNode:
  case CommCategory::CollectionToNodeBcast:
    return std::make_pair(comm.toNode(), comm.fromObj());
  }

  vtAssert(false, "Invalid balance::CommCategory enum value");
  return std::make_pair(ElementIDType{}, ElementIDType{});
}

auto const num_iters = node_data_.size();
/**
 * Append the records for a single phase to the already-open stats file.
 *
 * Two kinds of newline-terminated records are written:
 *  - one per entry in node_data_ for the phase:
 *      phase,id,time,num_subphases,[t0,t1,...]
 *  - one per entry in node_comm_ for the phase:
 *      phase,receiver,sender,bytes,category
 *    where category is the underlying integer value of the CommCategory.
 *
 * Returns immediately when the vt_lb_stats option is not set. The file is
 * left open afterwards so later phases can be written to it.
 *
 * \param phase the phase whose statistics are written
 */
void NodeStats::outputStatsForPhase(PhaseType phase) {
  // Statistics output when LB is enabled and appropriate flag is enabled
  if (!theConfig()->vt_lb_stats) {
    return;
  }

  vtAssertExpr(stats_file_ != nullptr);

  vt_print(lb, "NodeStats::outputStatsForPhase: phase={}\n", phase);

  // Per-object load records, including the bracketed list of subphase times
  for (auto&& elm : node_data_.at(phase)) {
    ElementIDType id = elm.first;
    TimeType time = elm.second;
    const auto& subphase_times = node_subphase_data_.at(phase)[id];
    size_t subphases = subphase_times.size();

    auto obj_str = fmt::format("{},{},{},{},[", phase, id, time, subphases);

    for (size_t s = 0; s < subphases; s++) {
      // Separator goes before every element except the first
      if (s > 0) {
        obj_str += ",";
      }

      obj_str += std::to_string(subphase_times[s]);
    }

    obj_str += "]\n";

    fprintf(stats_file_, "%s", obj_str.c_str());
  }

  // Communication records: receiver/sender resolved per category
  for (auto&& elm : node_comm_.at(phase)) {
    using E = typename std::underlying_type<CommCategory>::type;

    auto const& comm = elm.first;
    auto const recvSend = getRecvSendDirection(comm);
    auto const cat = static_cast<E>(comm.cat_);
    auto obj_str = fmt::format(
      "{},{},{},{},{}\n", phase, recvSend.first, recvSend.second,
      elm.second.bytes, cat
    );
    fprintf(stats_file_, "%s", obj_str.c_str());
  }

  // Flush while the stream is still open; the file is closed in finalize()
  fflush(stats_file_);
}

ElementIDType NodeStats::addNodeStats(
Expand Down
7 changes: 5 additions & 2 deletions src/vt/vrt/collection/balance/node_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
void releaseLB();

/**
* \internal \brief Output stats file for given phase based on instrumented data
*
* The contents of the file consist of a series of records separated
* by newlines. Each record consists of comma separated fields. The
Expand Down Expand Up @@ -161,7 +161,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
* recipient and distinguishing point-to-point messages from
* broadcasts, as a decimal integer.
*/
void outputStatsFile();
void outputStatsForPhase(PhaseType phase);

/**
* \internal \brief Generate the next object element ID for LB
Expand Down Expand Up @@ -237,6 +237,9 @@ struct NodeStats : runtime::component::Component<NodeStats> {
*/
VirtualProxyType getCollectionProxyForElement(ElementIDType temp_id) const;

void initialize() override;
void finalize() override;

private:
/**
* \internal \brief Create the stats file
Expand Down
8 changes: 0 additions & 8 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,6 @@ CollectionManager::CollectionManager() { }

/**
 * Component shutdown hook: runs cleanupAll<>().
 *
 * LB statistics are intentionally not written or cleared here.
 * NodeStats::outputStatsForPhase() writes them per phase, and
 * NodeStats::finalize() closes the stats file and clears the stats, so
 * repeating that teardown here would duplicate it.
 */
void CollectionManager::finalize() {
  cleanupAll<>();
}

/*virtual*/ CollectionManager::~CollectionManager() { }
Expand Down
Loading

0 comments on commit 580823d

Please sign in to comment.