diff --git a/src/vt/configs/arguments/args.cc b/src/vt/configs/arguments/args.cc index b7268530cb..180505e1ff 100644 --- a/src/vt/configs/arguments/args.cc +++ b/src/vt/configs/arguments/args.cc @@ -80,6 +80,9 @@ namespace vt { namespace arguments { /*static*/ std::string ArgConfig::vt_lb_file_name = "balance.in"; /*static*/ std::string ArgConfig::vt_lb_name = "NoLB"; /*static*/ int32_t ArgConfig::vt_lb_interval = 1; +/*static*/ bool ArgConfig::vt_lb_stats = false; +/*static*/ std::string ArgConfig::vt_lb_stats_dir = "vt_lb_stats"; +/*static*/ std::string ArgConfig::vt_lb_stats_file = "stats"; /*static*/ bool ArgConfig::vt_no_detect_hang = false; /*static*/ int64_t ArgConfig::vt_hang_freq = 1024; @@ -324,25 +327,36 @@ namespace vt { namespace arguments { * Flags for enabling load balancing and configuring it */ - auto lb = "Enable load balancing"; - auto lb_file = "Enable reading LB configuration from file"; - auto lb_file_name = "LB configuration file to read"; - auto lb_name = "Name of the load balancer to use"; - auto lb_interval = "Load balancing interval"; + auto lb = "Enable load balancing"; + auto lb_file = "Enable reading LB configuration from file"; + auto lb_file_name = "LB configuration file to read"; + auto lb_name = "Name of the load balancer to use"; + auto lb_interval = "Load balancing interval"; + auto lb_stats = "Enable load balancing statistics"; + auto lb_stats_dir = "Load balancing statistics output directory"; + auto lb_stats_file = "Load balancing statistics output file name"; auto lbn = "NoLB"; auto lbi = 1; auto lbf = "balance.in"; - auto s = app.add_flag("--vt_lb", vt_lb, lb); - auto t = app.add_flag("--vt_lb_file", vt_lb_file, lb_file); - auto u = app.add_option("--vt_lb_file_name", vt_lb_file_name, lb_file_name, lbf); - auto v = app.add_option("--vt_lb_name", vt_lb_name, lb_name, lbn); - auto w = app.add_option("--vt_lb_interval", vt_lb_interval, lb_interval, lbi); + auto lbd = "vt_lb_stats"; + auto lbs = "stats"; + auto s = app.add_flag("--vt_lb", vt_lb, lb); + auto t = app.add_flag("--vt_lb_file", vt_lb_file, lb_file); + auto u = app.add_option("--vt_lb_file_name", vt_lb_file_name, lb_file_name, lbf); + auto v = app.add_option("--vt_lb_name", vt_lb_name, lb_name, lbn); + auto w = app.add_option("--vt_lb_interval", vt_lb_interval, lb_interval, lbi); + auto ww = app.add_flag("--vt_lb_stats", vt_lb_stats, lb_stats); + auto wx = app.add_option("--vt_lb_stats_dir", vt_lb_stats_dir, lb_stats_dir, lbd); + auto wy = app.add_option("--vt_lb_stats_file", vt_lb_stats_file, lb_stats_file,lbs); auto debugLB = "Load Balancing"; s->group(debugLB); t->group(debugLB); u->group(debugLB); v->group(debugLB); w->group(debugLB); + ww->group(debugLB); + wx->group(debugLB); + wy->group(debugLB); /* * Flags for controlling termination diff --git a/src/vt/configs/arguments/args.h b/src/vt/configs/arguments/args.h index ef8a7b0983..9bf6e6f4c8 100644 --- a/src/vt/configs/arguments/args.h +++ b/src/vt/configs/arguments/args.h @@ -83,6 +83,9 @@ struct ArgConfig { static std::string vt_lb_file_name; static std::string vt_lb_name; static int32_t vt_lb_interval; + static bool vt_lb_stats; + static std::string vt_lb_stats_dir; + static std::string vt_lb_stats_file; static bool vt_no_detect_hang; static int64_t vt_hang_freq; diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 90cb3d3634..49fc932d73 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -368,6 +368,10 @@ void Runtime::printStartupBanner() { auto f9 = warn_cr("--vt_lb", "lblite"); fmt::print("{}\t{}{}", vt_pre, f9, reset); } + if (ArgType::vt_lb_stats) { + auto f9 = warn_cr("--vt_lb_stats", "lblite"); + fmt::print("{}\t{}{}", vt_pre, f9, reset); + } #endif if (ArgType::vt_lb) { @@ -390,6 +394,26 @@ void Runtime::printStartupBanner() { } } + if (ArgType::vt_lb_stats) { + auto f9 = opt_on("--vt_lb_stats", "Load balancing statistics collection"); + fmt::print("{}\t{}{}", vt_pre, f9, reset); + + auto const fname = ArgType::vt_lb_stats_file; + if (fname != "") { + auto f11 = fmt::format("LB stats file name \"{}.0.out\"", fname); + auto f12 = opt_on("--vt_lb_stats_file", f11); + fmt::print("{}\t{}{}", vt_pre, f12, reset); + } + + auto const fdir = ArgType::vt_lb_stats_dir; + if (fdir != "") { + auto f11 = fmt::format("LB stats directory \"{}\"", fdir); + auto f12 = opt_on("--vt_lb_stats_dir", f11); + fmt::print("{}\t{}{}", vt_pre, f12, reset); + } + } + + #if !backend_check_enabled(trace_enabled) if (ArgType::vt_trace) { auto f9 = warn_cr("--vt_trace", "trace_enabled"); diff --git a/src/vt/runtime/runtime.h b/src/vt/runtime/runtime.h index d8c29d04bc..8db201e7f7 100644 --- a/src/vt/runtime/runtime.h +++ b/src/vt/runtime/runtime.h @@ -96,6 +96,8 @@ struct Runtime { RuntimeInstType getInstanceID() const { return instance_; } + void systemSync() { sync(); } + private: RuntimeInstType const instance_; diff --git a/src/vt/vrt/collection/balance/proc_stats.cc b/src/vt/vrt/collection/balance/proc_stats.cc index b2a6bdb366..9d4d77a454 100644 --- a/src/vt/vrt/collection/balance/proc_stats.cc +++ b/src/vt/vrt/collection/balance/proc_stats.cc @@ -46,10 +46,16 @@ #include "vt/vrt/collection/balance/proc_stats.h" #include "vt/vrt/collection/manager.h" #include "vt/timing/timing.h" +#include "vt/configs/arguments/args.h" +#include "vt/runtime/runtime.h" #include #include +#include +#include +#include +#include namespace vt { namespace vrt { namespace collection { namespace balance { @@ -64,6 +70,10 @@ std::unordered_map /*static*/ ProcStats::ElementIDType ProcStats::next_elm_ = 1; +/*static*/ FILE* ProcStats::stats_file_ = nullptr; + +/*static*/ bool ProcStats::created_dir_ = false; + /*static*/ void ProcStats::clearStats() { ProcStats::proc_data_.clear(); ProcStats::proc_migrate_.clear(); @@ -83,4 +93,61 @@ std::unordered_map CollectionManager::releaseLBPhase(msg.get()); } +/*static*/ void ProcStats::createStatsFile() { + using ArgType = vt::arguments::ArgConfig; + auto const node = theContext()->getNode(); + auto const base_file = std::string(ArgType::vt_lb_stats_file); + auto const dir = std::string(ArgType::vt_lb_stats_dir); + auto const file = fmt::format("{}.{}.out", base_file, node); + auto const file_name = fmt::format("{}/{}", dir, file); + + debug_print( + vrt_coll, node, + "ProcStats: createStatsFile file={}\n", file_name + ); + + // Node 0 creates the directory + if (not created_dir_ and node == 0) { + mkdir(dir.c_str(), S_IRWXU); + created_dir_ = true; + } + + // Barrier: wait for node 0 to create directory before trying to put a file in + // the stats destination directory + if (curRT) { + curRT->systemSync(); + } else { + // Something is wrong + vtAssert(false, "Trying to dump stats when VT runtime is deallocated?"); + } + + stats_file_ = fopen(file_name.c_str(), "w+"); +} + +/*static*/ void ProcStats::closeStatsFile() { + if (stats_file_) { + fclose(stats_file_); + stats_file_ = nullptr; + } +} + +/*static*/ void ProcStats::outputStatsFile() { + if (stats_file_ == nullptr) { + createStatsFile(); + } + + vtAssertExpr(stats_file_ != nullptr); + + auto const num_iters = ProcStats::proc_data_.size(); + for (auto i = 0; i < num_iters; i++) { + for (auto&& elm : ProcStats::proc_data_.at(i)) { + auto obj_str = fmt::format("{},{},{}\n", i, elm.first, elm.second); + fprintf(stats_file_, "%s", obj_str.c_str()); + } + } + fflush(stats_file_); + + closeStatsFile(); +} + }}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/proc_stats.h b/src/vt/vrt/collection/balance/proc_stats.h index 73b25c8872..f534070f20 100644 --- a/src/vt/vrt/collection/balance/proc_stats.h +++ b/src/vt/vrt/collection/balance/proc_stats.h @@ -54,6 +54,7 @@ #include #include #include +#include #include namespace vt { namespace vrt { namespace collection { namespace balance { @@ -72,6 +73,12 @@ struct ProcStats { static void clearStats(); static void releaseLB(); + static void outputStatsFile(); + +private: + static void createStatsFile(); + static void closeStatsFile(); + private: static ElementIDType getNextElm(); @@ -81,6 +88,9 @@ struct ProcStats { public: static std::vector> proc_data_; static std::unordered_map proc_migrate_; +private: + static FILE* stats_file_; + static bool created_dir_; }; }}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/proc_stats.impl.h b/src/vt/vrt/collection/balance/proc_stats.impl.h index b93eacf54e..74413585bb 100644 --- a/src/vt/vrt/collection/balance/proc_stats.impl.h +++ b/src/vt/vrt/collection/balance/proc_stats.impl.h @@ -60,7 +60,12 @@ template VirtualElmProxyType const& elm_proxy, ColT* col_elm, PhaseType const& phase, TimeType const& time ) { - auto const& next_elm = ProcStats::getNextElm(); + // Assign a new element ID if it's the first time this runs + if (col_elm->stats_elm_id_ == 0) { + col_elm->stats_elm_id_ = ProcStats::getNextElm(); + } + + auto const next_elm = col_elm->stats_elm_id_; debug_print( vrt_coll, node, @@ -77,14 +82,15 @@ template std::forward_as_tuple(time) ); auto migrate_iter = proc_migrate_.find(next_elm); - vtAssert(migrate_iter == proc_migrate_.end(), "Migrate func must not exist"); - proc_migrate_.emplace( - std::piecewise_construct, - std::forward_as_tuple(next_elm), - std::forward_as_tuple([elm_proxy,col_elm](NodeType node){ - col_elm->migrate(node); - }) - ); + if (migrate_iter == proc_migrate_.end()) { + proc_migrate_.emplace( + std::piecewise_construct, + std::forward_as_tuple(next_elm), + std::forward_as_tuple([elm_proxy,col_elm](NodeType node){ + col_elm->migrate(node); + }) + ); + } return next_elm; } diff --git a/src/vt/vrt/collection/holders/holder.h b/src/vt/vrt/collection/holders/holder.h index c1facfcdd4..c40e032377 100644 --- a/src/vt/vrt/collection/holders/holder.h +++ b/src/vt/vrt/collection/holders/holder.h @@ -91,6 +91,9 @@ struct Holder { void setGroupReady(bool const ready) { group_ready_ = ready; } NodeType groupRoot() const { return group_root_; } void setGroupRoot(NodeType const root) { group_root_ = root; } + CountType numReady() const { return elements_ready_; } + void addReady(CountType num = 1) { elements_ready_ += 1; } + void clearReady() { elements_ready_ = 0; } friend struct CollectionManager; @@ -106,6 +109,7 @@ struct Holder { bool group_ready_ = false; NodeType group_root_ = 0; CountType num_erased_not_removed_ = 0; + CountType elements_ready_ = 0; }; }}} /* end namespace vt::vrt::collection */ diff --git a/src/vt/vrt/collection/manager.h b/src/vt/vrt/collection/manager.h index 85614393a9..a80a02c9bc 100644 --- a/src/vt/vrt/collection/manager.h +++ b/src/vt/vrt/collection/manager.h @@ -71,6 +71,8 @@ #include "vt/collective/collective_alg.h" #include "vt/collective/reduce/reduce_msg.h" #include "vt/collective/reduce/reduce_hash.h" +#include "vt/configs/arguments/args.h" +#include "vt/vrt/collection/balance/proc_stats.h" #include #include @@ -113,6 +115,7 @@ struct CollectionManager { using CleanupListFnType = std::list; using DispatchHandlerType = auto_registry::AutoHandlerType; using ActionVecType = std::vector; + using ArgType = vt::arguments::ArgConfig; template using DistribConstructFn = std::function(IndexT idx)>; @@ -133,7 +136,19 @@ struct CollectionManager { CollectionManager() = default; - virtual ~CollectionManager() { cleanupAll<>(); } + virtual ~CollectionManager() { + cleanupAll<>(); + + // Statistics output when LB is enabled and appropriate flag is enabled + backend_enable_if( + lblite, { + if (ArgType::vt_lb_stats) { + balance::ProcStats::outputStatsFile(); + balance::ProcStats::clearStats(); + } + } + ); + } template void cleanupAll(); @@ -604,7 +619,15 @@ struct CollectionManager { ); /* - * LB-related operations on the collection + * ====================================================================== + * LB-related operations on the collection + * ====================================================================== + */ + + /* + * The `nextPhase` function is called by a single node on the whole collection + * to indicate that LB is ready. This includes all collections and thus may + * require extra sync to invoke safely */ template void nextPhase( @@ -643,11 +666,15 @@ struct CollectionManager { void checkReduceNoElements(); private: - template + template CollectionHolder* findColHolder(VirtualProxyType const& proxy); - template + + template Holder* findElmHolder(VirtualProxyType const& proxy); + template + Holder* findElmHolder(CollectionProxyWrapType proxy); + public: template void destroy(CollectionProxyWrapType const& proxy); diff --git a/src/vt/vrt/collection/manager.impl.h b/src/vt/vrt/collection/manager.impl.h index d8444cf8a7..a9a79a503b 100644 --- a/src/vt/vrt/collection/manager.impl.h +++ b/src/vt/vrt/collection/manager.impl.h @@ -2679,6 +2679,13 @@ Holder* CollectionManager::findElmHolder( } } +template +Holder* CollectionManager::findElmHolder( + CollectionProxyWrapType proxy +) { + return findElmHolder(proxy.getProxy()); +} + template std::size_t CollectionManager::numCollections() { return UniversalIndexHolder<>::getNumCollections(); diff --git a/src/vt/vrt/collection/types/migratable.h b/src/vt/vrt/collection/types/migratable.h index ffb4426845..35188540da 100644 --- a/src/vt/vrt/collection/types/migratable.h +++ b/src/vt/vrt/collection/types/migratable.h @@ -51,6 +51,7 @@ #include "vt/vrt/collection/types/migrate_hooks.h" #include "vt/vrt/collection/types/migratable.fwd.h" #include "vt/vrt/collection/balance/elm_stats.h" +#include "vt/vrt/collection/balance/proc_stats.h" namespace vt { namespace vrt { namespace collection { @@ -99,8 +100,10 @@ struct Migratable : MigrateHookBase { protected: friend struct balance::ElementStats; + friend struct balance::ProcStats; balance::ElementStats stats_; balance::ElementStats& getStats() { return stats_; } + balance::ProcStats::ElementIDType stats_elm_id_ = 0; }; }}} /* end namespace vt::vrt::collection */ diff --git a/src/vt/vrt/collection/types/migratable.impl.h b/src/vt/vrt/collection/types/migratable.impl.h index d79f2dd39b..35e5fa98af 100644 --- a/src/vt/vrt/collection/types/migratable.impl.h +++ b/src/vt/vrt/collection/types/migratable.impl.h @@ -55,6 +55,7 @@ template void Migratable::serialize(Serializer& s) { MigrateHookBase::serialize(s); s | stats_; + s | stats_elm_id_; } template