diff --git a/src/vt/configs/types/types_type.h b/src/vt/configs/types/types_type.h index 01ed8eab7b..dbfaf58640 100644 --- a/src/vt/configs/types/types_type.h +++ b/src/vt/configs/types/types_type.h @@ -106,8 +106,8 @@ using GroupType = uint64_t; using MsgSizeType = int32_t; /// Used for hold a phase for load balancing using PhaseType = uint64_t; -/// Used for hold a phase for load balancing -using SubphaseType = uint16_t; +/// Used for hold a subphase for load balancing +using SubphaseType = int16_t; /// Used for hold the identifier for a pipe (callbacks) using PipeType = uint64_t; /// Used for hold the proxy ID for an objgroup diff --git a/src/vt/vrt/collection/balance/elm_stats.cc b/src/vt/vrt/collection/balance/elm_stats.cc index ed1fcc3a1c..fd2941aef2 100644 --- a/src/vt/vrt/collection/balance/elm_stats.cc +++ b/src/vt/vrt/collection/balance/elm_stats.cc @@ -79,38 +79,31 @@ void ElementStats::stopTime() { ); } -void ElementStats::recvComm( - LBCommKey key, double bytes -) { - comm_.resize(cur_phase_ + 1); - comm_.at(cur_phase_)[key].receiveMsg(bytes); - subphase_comm_.resize(cur_phase_ + 1); - subphase_comm_.at(cur_phase_).resize(cur_subphase_ + 1); - subphase_comm_.at(cur_phase_).at(cur_subphase_)[key].receiveMsg(bytes); -} - void ElementStats::recvObjData( ElementIDType pto, ElementIDType tto, ElementIDType pfrom, ElementIDType tfrom, double bytes, bool bcast ) { + comm_.resize(cur_phase_ + 1); LBCommKey key(LBCommKey::CollectionTag{}, pfrom, tfrom, pto, tto, bcast); - recvComm(key, bytes); + comm_.at(cur_phase_)[key].receiveMsg(bytes); } void ElementStats::recvFromNode( ElementIDType pto, ElementIDType tto, NodeType from, double bytes, bool bcast ) { + comm_.resize(cur_phase_ + 1); LBCommKey key(LBCommKey::NodeToCollectionTag{}, from, pto, tto, bcast); - recvComm(key, bytes); + comm_.at(cur_phase_)[key].receiveMsg(bytes); } void ElementStats::recvToNode( NodeType to, ElementIDType pfrom, ElementIDType tfrom, double bytes, bool bcast ) { + comm_.resize(cur_phase_ + 1); LBCommKey key(LBCommKey::CollectionToNodeTag{}, pfrom, tfrom, to, bcast); - recvComm(key, bytes); + comm_.at(cur_phase_)[key].receiveMsg(bytes); } void ElementStats::setModelWeight(TimeType const& time) { @@ -202,25 +195,12 @@ ElementStats::getComm(PhaseType const& phase) { return phase_comm; } -std::vector const& ElementStats::getSubphaseComm(PhaseType phase) { - subphase_comm_.resize(phase + 1); - auto const& subphase_comm = subphase_comm_[phase]; - - vt_debug_print( - lb, node, - "ElementStats: getSubphaseComm: comm size={}, phase={}\n", - subphase_comm.size(), phase - ); - - return subphase_comm; -} - void ElementStats::setSubPhase(SubphaseType subphase) { vtAssert(subphase < no_subphase, "subphase must be less than sentinel"); cur_subphase_ = subphase; } -SubphaseType ElementStats::getSubPhase() const { +typename ElementStats::SubphaseType ElementStats::getSubPhase() const { return cur_subphase_; } @@ -230,7 +210,7 @@ void ElementStats::setFocusedSubPhase(VirtualProxyType collection, SubphaseType } /*static*/ -SubphaseType ElementStats::getFocusedSubPhase(VirtualProxyType collection) { +ElementStats::SubphaseType ElementStats::getFocusedSubPhase(VirtualProxyType collection) { auto i = focused_subphase_.find(collection); if (i != focused_subphase_.end()) return i->second; @@ -238,6 +218,6 @@ SubphaseType ElementStats::getFocusedSubPhase(VirtualProxyType collection) { return no_subphase; } -/*static*/ std::unordered_map ElementStats::focused_subphase_; +/*static*/ std::unordered_map ElementStats::focused_subphase_; }}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/elm_stats.h b/src/vt/vrt/collection/balance/elm_stats.h index f42185a5da..d139ab0aed 100644 --- a/src/vt/vrt/collection/balance/elm_stats.h +++ b/src/vt/vrt/collection/balance/elm_stats.h @@ -62,6 +62,9 @@ namespace vt { namespace vrt { namespace collection { namespace balance { struct ElementStats { + using PhaseType = uint64_t; + using SubphaseType = uint16_t; + ElementStats() = default; ElementStats(ElementStats const&) = default; ElementStats(ElementStats&&) = default; @@ -69,7 +72,6 @@ struct ElementStats { void startTime(); void stopTime(); void addTime(TimeType const& time); - void recvComm(LBCommKey key, double bytes); void recvObjData( ElementIDType to_perm, ElementIDType to_temp, ElementIDType from_perm, ElementIDType from_temp, double bytes, bool bcast @@ -89,7 +91,6 @@ struct ElementStats { TimeType getLoad(PhaseType phase, SubphaseType subphase) const; CommMapType const& getComm(PhaseType const& phase); - std::vector const& getSubphaseComm(PhaseType phase); void setSubPhase(SubphaseType subphase); SubphaseType getSubPhase() const; @@ -115,7 +116,6 @@ struct ElementStats { SubphaseType cur_subphase_ = 0; std::vector> subphase_timings_ = {}; - std::vector> subphase_comm_ = {}; static std::unordered_map focused_subphase_; }; diff --git a/src/vt/vrt/collection/balance/elm_stats.impl.h b/src/vt/vrt/collection/balance/elm_stats.impl.h index 178fc54744..a51fad944a 100644 --- a/src/vt/vrt/collection/balance/elm_stats.impl.h +++ b/src/vt/vrt/collection/balance/elm_stats.impl.h @@ -90,11 +90,10 @@ template auto const& total_load = stats.getLoad(cur_phase, getFocusedSubPhase(untyped_proxy)); auto const& subphase_loads = stats.subphase_timings_.at(cur_phase); auto const& comm = stats.getComm(cur_phase); - auto const& subphase_comm = stats.getSubphaseComm(cur_phase); auto const& idx = col->getIndex(); auto const& elm_proxy = proxy[idx]; - theNodeStats()->addNodeStats(col, cur_phase, total_load, subphase_loads, comm, subphase_comm); + theNodeStats()->addNodeStats(col, cur_phase, total_load, subphase_loads, comm); auto const before_ready = theCollection()->numReadyCollections(); theCollection()->makeCollectionReady(untyped_proxy); @@ -107,6 +106,8 @@ template before_ready, after_ready, ready ); + using MsgType = InvokeReduceMsg; + auto lb_man = theLBManager()->getProxy(); auto const single_node = theContext()->getNumNodes() == 1; @@ -114,10 +115,10 @@ template bool const must_run_lb = lb != LBType::NoLB and not single_node; auto const num_collections = theCollection()->numCollections<>(); auto const do_sync = msg->doSync(); - auto nmsg = makeMessage(cur_phase,lb,msg->manual(),num_collections); + auto nmsg = makeMessage(cur_phase,lb,msg->manual(),num_collections); if (must_run_lb) { - auto cb = theCB()->makeBcast(lb_man); + auto cb = theCB()->makeBcast>(lb_man); proxy.reduce(nmsg.get(),cb); } else { @@ -128,7 +129,7 @@ template theCollection()->elmFinishedLB(elm_proxy,cur_phase); } - auto cb = theCB()->makeBcast(lb_man); + auto cb = theCB()->makeBcast>(lb_man); proxy.reduce(nmsg.get(),cb); } } diff --git a/src/vt/vrt/collection/balance/node_stats.cc b/src/vt/vrt/collection/balance/node_stats.cc index 505c01c166..2d007d452f 100644 --- a/src/vt/vrt/collection/balance/node_stats.cc +++ b/src/vt/vrt/collection/balance/node_stats.cc @@ -117,10 +117,6 @@ std::unordered_map const* NodeStats::getNodeComm() const return &node_comm_; } -std::unordered_map> const* NodeStats::getNodeSubphaseComm() const { - return &node_subphase_comm_; -} - void NodeStats::clearStats() { NodeStats::node_comm_.clear(); NodeStats::node_data_.clear(); @@ -131,7 +127,7 @@ void NodeStats::clearStats() { next_elm_ = 1; } -void NodeStats::startIterCleanup(PhaseType phase, unsigned int look_back) { +void NodeStats::startIterCleanup(PhaseType phase, int look_back) { // TODO: Add in subphase support here too // Convert the temp ID node_data_ for the last iteration into perm ID for @@ -146,7 +142,7 @@ void NodeStats::startIterCleanup(PhaseType phase, unsigned int look_back) { } node_data_[phase] = std::move(new_data); - if (phase >= look_back) { + if (phase - look_back >= 0) { node_data_.erase(phase - look_back); node_subphase_data_.erase(phase - look_back); node_comm_.erase(phase - look_back); @@ -174,24 +170,16 @@ void NodeStats::releaseLB() { CollectionManager::releaseLBPhase(msg_hold.get()); } -void NodeStats::initialize() { -#if vt_check_enabled(lblite) - if (theConfig()->vt_lb_stats) { - theNodeStats()->createStatsFile(); - } -#endif -} - void NodeStats::createStatsFile() { auto const node = theContext()->getNode(); - auto const base_file = theConfig()->vt_lb_stats_file; - auto const dir = theConfig()->vt_lb_stats_dir; + auto const base_file = std::string(theConfig()->vt_lb_stats_file); + auto const dir = std::string(theConfig()->vt_lb_stats_dir); auto const file = fmt::format("{}.{}.out", base_file, node); auto const file_name = fmt::format("{}/{}", dir, file); vt_debug_print( lb, node, - "NodeStats::createStatsFile: file={}\n", file_name + "NodeStats: createStatsFile file={}\n", file_name ); // Node 0 creates the directory @@ -210,98 +198,89 @@ void NodeStats::createStatsFile() { } stats_file_ = fopen(file_name.c_str(), "w+"); - vtAssertExpr(stats_file_ != nullptr); -} - -void NodeStats::finalize() { - // If statistics are enabled, close output file and clear stats -#if vt_check_enabled(lblite) - if (theConfig()->vt_lb_stats) { - closeStatsFile(); - clearStats(); - } -#endif } void NodeStats::closeStatsFile() { if (stats_file_) { fclose(stats_file_); - stats_file_ = nullptr; + stats_file_ = nullptr; } } -std::pair -getRecvSendDirection(CommKeyType const& comm) { - switch (comm.cat_) { - case CommCategory::SendRecv: - case CommCategory::Broadcast: - return std::make_pair(comm.toObj(), comm.fromObj()); - - case CommCategory::NodeToCollection: - case CommCategory::NodeToCollectionBcast: - return std::make_pair(comm.toObj(), comm.fromNode()); - - case CommCategory::CollectionToNode: - case CommCategory::CollectionToNodeBcast: - return std::make_pair(comm.toNode(), comm.fromObj()); - } - - vtAssert(false, "Invalid balance::CommCategory enum value"); - return std::make_pair(ElementIDType{}, ElementIDType{}); -} - -void NodeStats::outputStatsForPhase(PhaseType phase) { - // Statistics output when LB is enabled and appropriate flag is enabled - if (!theConfig()->vt_lb_stats) { - return; +void NodeStats::outputStatsFile() { + if (stats_file_ == nullptr) { + createStatsFile(); } vtAssertExpr(stats_file_ != nullptr); - vt_print(lb, "NodeStats::outputStatsForPhase: phase={}\n", phase); + auto const num_iters = node_data_.size(); - for (auto&& elm : node_data_.at(phase)) { - ElementIDType id = elm.first; - TimeType time = elm.second; - const auto& subphase_times = node_subphase_data_.at(phase)[id]; - size_t subphases = subphase_times.size(); + vt_print(lb, "NodeStats::outputStatsFile: file={}, iter={}\n", print_ptr(stats_file_), num_iters); - auto obj_str = fmt::format("{},{},{},{},[", phase, id, time, subphases); + for (size_t i = 0; i < num_iters; i++) { + for (auto&& elm : node_data_.at(i)) { + ElementIDType id = elm.first; + TimeType time = elm.second; + const auto& subphase_times = node_subphase_data_.at(i)[id]; + size_t subphases = subphase_times.size(); - for (size_t s = 0; s < subphases; s++) { - if (s > 0) { - obj_str += ","; + auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases); + for (size_t s = 0; s < subphases; s++) { + obj_str += std::to_string(subphase_times[s]); + if (s != subphases - 1) + obj_str += ","; } - obj_str += std::to_string(subphase_times[s]); - } - - obj_str += "]\n"; - - fprintf(stats_file_, "%s", obj_str.c_str()); - } + obj_str += "]\n"; - for (auto&& elm : node_comm_.at(phase)) { - using E = typename std::underlying_type::type; - - auto const& comm = elm.first; - auto const recvSend = getRecvSendDirection(comm); - auto const cat = static_cast(comm.cat_); - auto obj_str = fmt::format( - "{},{},{},{},{}\n", phase, recvSend.first, recvSend.second, - elm.second.bytes, cat - ); - fprintf(stats_file_, "%s", obj_str.c_str()); + fprintf(stats_file_, "%s", obj_str.c_str()); + } + for (auto&& elm : node_comm_.at(i)) { + using E = typename std::underlying_type::type; + + auto const& key = elm.first; + auto const& val = elm.second; + auto const cat = static_cast(key.cat_); + + if ( + key.cat_ == CommCategory::SendRecv or + key.cat_ == CommCategory::Broadcast + ) { + auto const to = key.toObj(); + auto const from = key.fromObj(); + auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat); + fprintf(stats_file_, "%s", obj_str.c_str()); + } else if ( + key.cat_ == CommCategory::NodeToCollection or + key.cat_ == CommCategory::NodeToCollectionBcast + ) { + auto const to = key.toObj(); + auto const from = key.fromNode(); + auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat); + fprintf(stats_file_, "%s", obj_str.c_str()); + } else if ( + key.cat_ == CommCategory::CollectionToNode or + key.cat_ == CommCategory::CollectionToNodeBcast + ) { + auto const to = key.toNode(); + auto const from = key.fromObj(); + auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat); + fprintf(stats_file_, "%s", obj_str.c_str()); + } else { + vtAssert(false, "Invalid balance::CommCategory enum value"); + } + } } - fflush(stats_file_); + + closeStatsFile(); } ElementIDType NodeStats::addNodeStats( Migratable* col_elm, PhaseType const& phase, TimeType const& time, - std::vector const& subphase_time, - CommMapType const& comm, std::vector const& subphase_comm + std::vector const& subphase_time, CommMapType const& comm ) { // A new temp ID gets assigned when a object is migrated into a node @@ -337,13 +316,6 @@ ElementIDType NodeStats::addNodeStats( comm_data[c.first] += c.second; } - auto &subphase_comm_data = node_subphase_comm_[phase]; - for (SubphaseType i = 0; i < subphase_comm.size(); i++) { - for (auto& sp : subphase_comm[i]) { - subphase_comm_data[i][sp.first] += sp.second; - } - } - node_temp_to_perm_[temp_id] = perm_id; node_perm_to_temp_[perm_id] = temp_id; diff --git a/src/vt/vrt/collection/balance/node_stats.h b/src/vt/vrt/collection/balance/node_stats.h index 188b84b759..a4a697871d 100644 --- a/src/vt/vrt/collection/balance/node_stats.h +++ b/src/vt/vrt/collection/balance/node_stats.h @@ -108,14 +108,13 @@ struct NodeStats : runtime::component::Component { * \param[in] phase the current phase * \param[in] time the time the object took * \param[in] comm the comm graph for the object - eric * + * * \return the temporary ID for the object assigned for this phase */ ElementIDType addNodeStats( Migratable* col_elm, PhaseType const& phase, TimeType const& time, - std::vector const& subphase_time, - CommMapType const& comm, std::vector const& subphase_comm + std::vector const& subphase_time, CommMapType const& comm ); /** @@ -126,7 +125,7 @@ struct NodeStats : runtime::component::Component { /** * \internal \brief Cleanup after LB runs; convert temporary to permanent IDs */ - void startIterCleanup(PhaseType phase, unsigned int look_back); + void startIterCleanup(PhaseType phase, int look_back); /** * \internal \brief Release collection after LB runs for this phase @@ -134,7 +133,7 @@ struct NodeStats : runtime::component::Component { void releaseLB(); /** - * \internal \brief Output stats file for given phase based on instrumented data + * \internal \brief Output stats file based on instrumented data * * The contents of the file consist of a series of records separated * by newlines. Each record consists of comma separated fields. The @@ -162,7 +161,7 @@ struct NodeStats : runtime::component::Component { * recipient and distinguishing point-to-point messages from * broadcasts, as a decimal integer. */ - void outputStatsForPhase(PhaseType phase); + void outputStatsFile(); /** * \internal \brief Generate the next object element ID for LB @@ -190,13 +189,6 @@ struct NodeStats : runtime::component::Component { */ std::unordered_map const* getNodeComm() const; - /** - * \internal \brief Get stored object comm subphase graph - * - * \return an observer pointer to the comm subphase graph - */ - std::unordered_map> const* getNodeSubphaseComm() const; - /** * \internal \brief Test if this node has an object to migrate * @@ -245,9 +237,6 @@ struct NodeStats : runtime::component::Component { */ VirtualProxyType getCollectionProxyForElement(ElementIDType temp_id) const; - void initialize() override; - void finalize() override; - private: /** * \internal \brief Create the stats file @@ -276,8 +265,6 @@ struct NodeStats : runtime::component::Component { std::unordered_map node_collection_lookup_; /// Node communication graph for each local object std::unordered_map node_comm_; - /// Node communication graph for each subphase - std::unordered_map> node_subphase_comm_; /// The current element ID ElementIDType next_elm_; /// The stats file name for outputting instrumentation diff --git a/src/vt/vrt/proxy/collection_proxy.h b/src/vt/vrt/proxy/collection_proxy.h index 532a0b8b3a..5028f37764 100644 --- a/src/vt/vrt/proxy/collection_proxy.h +++ b/src/vt/vrt/proxy/collection_proxy.h @@ -115,7 +115,7 @@ struct CollectionProxy : ProxyCollectionTraits { * * This must be called on every process */ - void setFocusedSubPhase(SubphaseType subphase); + void setFocusedSubPhase(balance::ElementStats::SubphaseType subphase); }; }}} /* end namespace vt::vrt::collection */