From 206b0ca3f16682d332a0abd70a13d904c6b65e54 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Thu, 24 Feb 2022 14:19:31 -0800 Subject: [PATCH 01/41] #1265: config: add debugging for replay --- src/vt/configs/arguments/app_config.h | 2 ++ src/vt/configs/arguments/args.cc | 3 +++ src/vt/configs/debug/debug_config.h | 4 +++- src/vt/runtime/runtime_banner.cc | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/vt/configs/arguments/app_config.h b/src/vt/configs/arguments/app_config.h index 5572500a5e..bdabfed535 100644 --- a/src/vt/configs/arguments/app_config.h +++ b/src/vt/configs/arguments/app_config.h @@ -225,6 +225,7 @@ struct AppConfig { bool vt_debug_phase = false; bool vt_debug_context = false; bool vt_debug_epoch = false; + bool vt_debug_replay = false; bool vt_debug_print_flush = false; @@ -386,6 +387,7 @@ struct AppConfig { | vt_debug_phase | vt_debug_context | vt_debug_epoch + | vt_debug_replay | vt_debug_print_flush diff --git a/src/vt/configs/arguments/args.cc b/src/vt/configs/arguments/args.cc index 6cb0a03345..628f4fd88f 100644 --- a/src/vt/configs/arguments/args.cc +++ b/src/vt/configs/arguments/args.cc @@ -374,6 +374,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) { auto dcp = "Enable debug_phase = \"" debug_pp(phase) "\""; auto ddp = "Enable debug_context = \"" debug_pp(context) "\""; auto dep = "Enable debug_epoch = \"" debug_pp(epoch) "\""; + auto dfp = "Enable debug_replay = \"" debug_pp(replay) "\""; auto r1 = app.add_option("--vt_debug_level", appConfig.vt_debug_level, rq); @@ -410,6 +411,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) { auto dc = app.add_flag("--vt_debug_phase", appConfig.vt_debug_phase, dcp); auto dd = app.add_flag("--vt_debug_context", appConfig.vt_debug_context, ddp); auto de = app.add_flag("--vt_debug_epoch", appConfig.vt_debug_epoch, dep); + auto df = app.add_flag("--vt_debug_replay", appConfig.vt_debug_replay, dfp); auto debugGroup = "Debug Print Configuration (must be compile-time enabled)"; r->group(debugGroup); @@ -446,6 +448,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) { dc->group(debugGroup); dd->group(debugGroup); de->group(debugGroup); + df->group(debugGroup); auto dbq = "Always flush VT runtime prints"; auto eb = app.add_flag("--vt_debug_print_flush", appConfig.vt_debug_print_flush, dbq); diff --git a/src/vt/configs/debug/debug_config.h b/src/vt/configs/debug/debug_config.h index 999586b1fc..bde85b0620 100644 --- a/src/vt/configs/debug/debug_config.h +++ b/src/vt/configs/debug/debug_config.h @@ -80,7 +80,8 @@ enum CatEnum : uint64_t { phase = 1ull<<28, context = 1ull<<29, epoch = 1ull<<30, - temperedwmin = 1ull<<31 + temperedwmin = 1ull<<31, + replay = 1ull<<32 }; enum CtxEnum : uint64_t { @@ -138,6 +139,7 @@ vt_option_category_pretty_print(reduce, "reduce") vt_option_category_pretty_print(rdma, "RDMA") vt_option_category_pretty_print(rdma_channel, "RDMA Channel") vt_option_category_pretty_print(rdma_state, "RDMA State") +vt_option_category_pretty_print(replay, "replay") vt_option_category_pretty_print(runtime, "runtime") vt_option_category_pretty_print(scatter, "scatter") vt_option_category_pretty_print(serial_msg, "serialized-msg") diff --git a/src/vt/runtime/runtime_banner.cc b/src/vt/runtime/runtime_banner.cc index ee02c32387..260417fd86 100644 --- a/src/vt/runtime/runtime_banner.cc +++ b/src/vt/runtime/runtime_banner.cc @@ -904,6 +904,7 @@ void Runtime::printStartupBanner() { vt_runtime_debug_warn_compile(phase) vt_runtime_debug_warn_compile(context) vt_runtime_debug_warn_compile(epoch) + vt_runtime_debug_warn_compile(replay) auto arg_str = [](std::vector const& args) -> std::string { std::stringstream ss; From 77c3b7c2e9673aa4aede8818616caf0aa0b6e2f0 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Thu, 24 Feb 2022 14:43:22 -0800 Subject: [PATCH 02/41] #1265: replay: add collectionless replay capability --- examples/collection/CMakeLists.txt | 1 + examples/collection/stats_replay_driver.cc | 70 ++++ src/vt/vrt/collection/balance/stats_replay.cc | 354 ++++++++++++++++++ src/vt/vrt/collection/balance/stats_replay.h | 102 +++++ 4 files changed, 527 insertions(+) create mode 100644 examples/collection/stats_replay_driver.cc create mode 100644 src/vt/vrt/collection/balance/stats_replay.cc create mode 100644 src/vt/vrt/collection/balance/stats_replay.h diff --git a/examples/collection/CMakeLists.txt b/examples/collection/CMakeLists.txt index edbd11d64e..e9eaf14fbb 100644 --- a/examples/collection/CMakeLists.txt +++ b/examples/collection/CMakeLists.txt @@ -9,6 +9,7 @@ set( insertable_collection reduce_integral transpose + stats_replay_driver ) foreach(EXAMPLE_NAME ${COLLECTION_EXAMPLES}) diff --git a/examples/collection/stats_replay_driver.cc b/examples/collection/stats_replay_driver.cc new file mode 100644 index 0000000000..398673a172 --- /dev/null +++ b/examples/collection/stats_replay_driver.cc @@ -0,0 +1,70 @@ +/* +//@HEADER +// ***************************************************************************** +// +// stats_replay_driver.cc +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include +#include + +int main(int argc, char** argv) { + using vt::PhaseType; + + vt::initialize(argc, argv); + + vtAbortIf( + argc != 3, + "Must have two arguments: " + ); + + // initial phase to simulate + PhaseType initial_phase = atoi(argv[1]); + // number of phases to simulate + PhaseType phases_to_run = atoi(argv[2]); + + vt::vrt::collection::balance::replayFromInputStats( + initial_phase, phases_to_run + ); + + vt::finalize(); + + return 0; +} diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc new file mode 100644 index 0000000000..1fd8c9b1c6 --- /dev/null +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -0,0 +1,354 @@ +/* +//@HEADER +// ***************************************************************************** +// +// stats_replay.cc +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include "vt/config.h" +#include "vt/vrt/collection/balance/stats_replay.h" +#include "vt/vrt/collection/balance/stats_data.h" +#include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" +#include "vt/utils/json/json_reader.h" + +#include + +#include + +namespace vt { namespace vrt { namespace collection { +namespace balance { + +void replayFromInputStats( + PhaseType initial_phase, PhaseType phases_to_run +) { + using util::json::Reader; + using ObjIDType = elm::ElementIDStruct; + + // read in object loads from json files + auto const filename = theConfig()->getLBStatsFileIn(); + Reader r{filename}; + auto json = r.readFile(); + auto sd = StatsData(*json); + + // remember vt's base load model + auto base_load_model = theLBManager()->getBaseLoadModel(); + + // allow remembering the migrations suggested by the load balancer + std::shared_ptr lb_reassignment = nullptr; + + // allow remembering what objects are here after the load balancer migrates + std::set objects_here; + + // simulate the requested number of phases + auto const this_rank = theContext()->getNode(); + auto stop_phase = initial_phase + phases_to_run; + for (PhaseType phase = initial_phase; phase < stop_phase; phase++) { + if (this_rank == 0) + vt_print(replay, "Simulated phase {}...\n", phase); + + // reapply the base load model if in case we overwrote it on a previous iter + theLBManager()->setLoadModel(base_load_model); + + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + + // point the load model at the stats for the relevant phase + runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { + base_load_model->updateLoads(phase); + }); + + size_t count = 0; + for (auto stat_obj_id : *base_load_model) { + if (stat_obj_id.isMigratable()) { + ++count; + vt_debug_print( + normal, replay, + "stats for id {} are here on phase {}\n", + stat_obj_id, phase + ); + } + } + // sanity output + vt_debug_print( + terse, replay, + "Stats num objects: {}\n", count + ); + + auto pre_lb_load_model = base_load_model; + + // if this isn't the initial phase, then the stats may exist on a rank + // other than where the objects are currently meant to exist; we will + // use a Reassignment object to get those load stats where they need to be + if (phase > initial_phase) { + // at the beginning of this phase, objects will exist in the locations + // they were placed by the previous lb invocation; this will be the + // arriving node for the purposes of this load model; that location + // is known by both the rank at which the lb placed the object and the + // rank from which the lb removed the object; the curr_node member of + // the object ids in the lb_reassignment object refers to the pre-lb + // location on the previous phase, but the curr_node member for our new + // load model must point to where the stats data exists for this phase + + // the stats data for this phase can exist at arbitrary locations; the + // only rank to know the location of this data is the one that has it; + // this will be the departing node for the purposes of this load model; + // we need to make sure the curr_node member of the object ids in our + // new load model points to the node on which the stats data lives + + runInEpochCollective("StatsReplayDriver -> migrateStatsDataHome", [&] { + auto norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + auto normalizer = norm_lb_proxy.get(); + pre_lb_load_model = normalizer->createStatsAtHomeModel( + base_load_model, objects_here + ); + norm_lb_proxy.destroyCollective(); + }); + theLBManager()->setLoadModel(pre_lb_load_model); + pre_lb_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + + runInEpochCollective("StatsReplayDriver -> migrateStatsDataHere", [&] { + auto norm_lb_proxy = LBStatsMigrator::construct(pre_lb_load_model); + auto normalizer = norm_lb_proxy.get(); + pre_lb_load_model = normalizer->createStatsHereModel( + pre_lb_load_model, objects_here + ); + norm_lb_proxy.destroyCollective(); + }); + theLBManager()->setLoadModel(pre_lb_load_model); + pre_lb_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + } + + // sanity output + count = 0; + for (auto stat_obj_id : *pre_lb_load_model) { + if (stat_obj_id.isMigratable()) { + ++count; + vt_debug_print( + normal, replay, + "element {} is here on phase {} pre-lb\n", + stat_obj_id, phase + ); + } + } + vt_debug_print( + terse, replay, + "Pre-lb num objects: {}\n", count + ); + + vt_debug_print( + terse, replay, + "constructing load model from real load balancer\n" + ); + + runInEpochCollective("StatsReplayDriver -> runRealLB", [&] { + // run the load balancer but don't let it automatically migrate; + // instead, remember where the LB wanted to migrate objects + lb_reassignment = theLBManager()->selectStartLB(phase); + + auto proposed_model = std::make_shared( + pre_lb_load_model, lb_reassignment + ); + objects_here.clear(); + for (auto it = proposed_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + ObjIDType loc_id = *it; + loc_id.curr_node = this_rank; + objects_here.insert(loc_id); + vt_debug_print( + normal, replay, + "element {} is here on phase {} post-lb\n", + loc_id, phase + ); + } + } + vt_debug_print( + terse, replay, + "Post-lb num objects: {}\n", objects_here.size() + ); + }); + runInEpochCollective("StatsReplayDriver -> destroyLB", [&] { + theLBManager()->destroyLB(); + }); + theCollective()->barrier(); + } +} + + +/*static*/ +objgroup::proxy::Proxy +LBStatsMigrator::construct(std::shared_ptr model_base) { + auto my_proxy = theObjGroup()->makeCollective(); + auto strat = my_proxy.get(); + strat->init(my_proxy); + auto base_proxy = my_proxy.template registerBaseCollective(); + vt_debug_print( + verbose, replay, + "LBStatsMigrator proxy={} base_proxy={}\n", + my_proxy.getProxy(), base_proxy.getProxy() + ); + strat->proxy_ = base_proxy; + strat->load_model_ = model_base.get(); + return my_proxy; +} + +void LBStatsMigrator::init(objgroup::proxy::Proxy in_proxy) { + proxy = in_proxy; +} + +void LBStatsMigrator::runLB(TimeType) { } + +void LBStatsMigrator::inputParams(SpecEntry* spec) { } + +std::unordered_map +LBStatsMigrator::getInputKeysWithHelp() { + std::unordered_map const keys_help; + return keys_help; +} + +std::shared_ptr +LBStatsMigrator::createStatsAtHomeModel( + std::shared_ptr model_base, + std::set objects_here +) { + auto const this_rank = vt::theContext()->getNode(); + vt_debug_print( + terse, replay, + "constructing load model to get loads from file location to home\n" + ); + + runInEpochCollective("LBStatsMigrator -> transferStatsHome", [&] { + for (auto stat_obj_id : *model_base) { + if (stat_obj_id.isMigratable()) { + // if the object belongs here, do nothing; otherwise, "transfer" it to + // the home rank + if (stat_obj_id.getHomeNode() != this_rank) { + if (objects_here.count(stat_obj_id) == 0) { + vt_debug_print( + verbose, replay, + "will transfer load of {} home to {}\n", + stat_obj_id, stat_obj_id.getHomeNode() + ); + migrateObjectTo(stat_obj_id, stat_obj_id.getHomeNode()); + } + } + } + } + }); + + auto tmp_assignment = normalizeReassignments(); + auto home_assignment = std::make_shared(); + home_assignment->node_ = tmp_assignment->node_; + home_assignment->global_migration_count = tmp_assignment->global_migration_count; + for (auto &dep : tmp_assignment->depart_) { + ObjIDType id = dep.first; + NodeType dest = dep.second; + id.curr_node = dest; + home_assignment->depart_[id] = dest; + } + for (auto &arr : tmp_assignment->arrive_) { + ObjIDType id = arr.first; + id.curr_node = this_rank; + home_assignment->arrive_[id] = arr.second; + } + return std::make_shared(model_base, home_assignment); +} + +std::shared_ptr +LBStatsMigrator::createStatsHereModel( + std::shared_ptr model_base, + std::set objects_here +) { + auto const this_rank = vt::theContext()->getNode(); + vt_debug_print( + terse, replay, + "constructing load model to get loads from home to here\n" + ); + + runInEpochCollective("LBStatsMigrator -> transferStatsHere", [&] { + for (auto stat_obj_id : objects_here) { + if (stat_obj_id.isMigratable()) { + // if the object is already here, do nothing; otherwise, "transfer" it + // from the home rank + bool stats_here = false; + for (auto other_id : *model_base) { + if (stat_obj_id == other_id) { + stats_here = true; + break; + } + } + if (!stats_here) { + // check that this isn't something that should already have been here + assert(stat_obj_id.getHomeNode() != this_rank); + + vt_debug_print( + verbose, replay, + "will transfer load of {} from home {}\n", + stat_obj_id, stat_obj_id.getHomeNode() + ); + ObjIDType mod_id = stat_obj_id; + // Override curr_node to force retrieval from the home rank + mod_id.curr_node = stat_obj_id.getHomeNode(); + migrateObjectTo(mod_id, this_rank); + } + } + } + }); + + auto tmp_assignment = normalizeReassignments(); + + // now restore the curr_node values to reflect the placement of the "real" object + auto here_assignment = std::make_shared(); + here_assignment->node_ = tmp_assignment->node_; + here_assignment->global_migration_count = tmp_assignment->global_migration_count; + for (auto &dep : tmp_assignment->depart_) { + ObjIDType id = dep.first; + NodeType dest = dep.second; + id.curr_node = dest; + here_assignment->depart_[id] = dest; + } + for (auto &arr : tmp_assignment->arrive_) { + ObjIDType id = arr.first; + id.curr_node = this_rank; + here_assignment->arrive_[id] = arr.second; + } + return std::make_shared(model_base, here_assignment); +} + +}}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/stats_replay.h new file mode 100644 index 0000000000..bf313b79e8 --- /dev/null +++ b/src/vt/vrt/collection/balance/stats_replay.h @@ -0,0 +1,102 @@ +/* +//@HEADER +// ***************************************************************************** +// +// stats_replay.h +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H +#define INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H + +#include "vt/config.h" +#include "vt/elm/elm_id.h" +#include "vt/vrt/collection/balance/baselb/baselb.h" +#include "vt/vrt/collection/balance/model/load_model.h" +#include "vt/vrt/collection/balance/model/proposed_reassignment.h" + +#include +#include +#include + +namespace vt { namespace vrt { namespace collection { +namespace balance { + +void replayFromInputStats( + PhaseType initial_phase, PhaseType phases_to_run +); + +struct LBStatsMigrator : lb::BaseLB { + + using ObjIDType = elm::ElementIDStruct; + + LBStatsMigrator() = default; + + static objgroup::proxy::Proxy + construct(std::shared_ptr model_base); + + void init(objgroup::proxy::Proxy in_proxy); + + void runLB(TimeType) override; + + void inputParams(SpecEntry* spec) override; + + static std::unordered_map getInputKeysWithHelp(); + + using BaseLB::normalizeReassignments; + + std::shared_ptr + createStatsAtHomeModel( + std::shared_ptr model_base, + std::set objects_here + ); + + std::shared_ptr + createStatsHereModel( + std::shared_ptr model_base, + std::set objects_here + ); + +private: + objgroup::proxy::Proxy proxy = {}; +}; + +}}}} /* end namespace vt::vrt::collection::balance */ + +#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H*/ From f5a58cf9a5f7ced85c4f2a397b75ceb8264b6534 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 7 Mar 2022 13:38:07 -0800 Subject: [PATCH 03/41] #1265: replay: make output more user-friendly --- src/vt/vrt/collection/balance/stats_replay.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index 1fd8c9b1c6..9dc15269ab 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -80,9 +80,6 @@ void replayFromInputStats( auto const this_rank = theContext()->getNode(); auto stop_phase = initial_phase + phases_to_run; for (PhaseType phase = initial_phase; phase < stop_phase; phase++) { - if (this_rank == 0) - vt_print(replay, "Simulated phase {}...\n", phase); - // reapply the base load model if in case we overwrote it on a previous iter theLBManager()->setLoadModel(base_load_model); @@ -117,6 +114,14 @@ void replayFromInputStats( // other than where the objects are currently meant to exist; we will // use a Reassignment object to get those load stats where they need to be if (phase > initial_phase) { + if (this_rank == 0) { + vt_print( + replay, + "Migrating imported object stats to phase {} ranks...\n", + phase + ); + } + // at the beginning of this phase, objects will exist in the locations // they were placed by the previous lb invocation; this will be the // arriving node for the purposes of this load model; that location @@ -155,6 +160,10 @@ void replayFromInputStats( pre_lb_load_model->setLoads(&sd.node_data_, &sd.node_comm_); } + if (this_rank == 0) { + vt_print(replay, "Simulating phase {}...\n", phase); + } + // sanity output count = 0; for (auto stat_obj_id : *pre_lb_load_model) { From 9cb6cd080e87fcc50494ac930ee6bec7f6b4f044 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 7 Mar 2022 13:54:02 -0800 Subject: [PATCH 04/41] #1265: replay: clean up and make more self-documenting --- src/vt/vrt/collection/balance/stats_replay.cc | 67 +++++++++---------- src/vt/vrt/collection/balance/stats_replay.h | 9 +-- 2 files changed, 32 insertions(+), 44 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index 9dc15269ab..f20b6d5630 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -74,7 +74,7 @@ void replayFromInputStats( std::shared_ptr lb_reassignment = nullptr; // allow remembering what objects are here after the load balancer migrates - std::set objects_here; + std::set migratable_objects_here; // simulate the requested number of phases auto const this_rank = theContext()->getNode(); @@ -141,7 +141,7 @@ void replayFromInputStats( auto norm_lb_proxy = LBStatsMigrator::construct(base_load_model); auto normalizer = norm_lb_proxy.get(); pre_lb_load_model = normalizer->createStatsAtHomeModel( - base_load_model, objects_here + base_load_model, migratable_objects_here ); norm_lb_proxy.destroyCollective(); }); @@ -152,7 +152,7 @@ void replayFromInputStats( auto norm_lb_proxy = LBStatsMigrator::construct(pre_lb_load_model); auto normalizer = norm_lb_proxy.get(); pre_lb_load_model = normalizer->createStatsHereModel( - pre_lb_load_model, objects_here + pre_lb_load_model, migratable_objects_here ); norm_lb_proxy.destroyCollective(); }); @@ -194,12 +194,12 @@ void replayFromInputStats( auto proposed_model = std::make_shared( pre_lb_load_model, lb_reassignment ); - objects_here.clear(); + migratable_objects_here.clear(); for (auto it = proposed_model->begin(); it.isValid(); ++it) { if ((*it).isMigratable()) { ObjIDType loc_id = *it; loc_id.curr_node = this_rank; - objects_here.insert(loc_id); + migratable_objects_here.insert(loc_id); vt_debug_print( normal, replay, "element {} is here on phase {} post-lb\n", @@ -209,7 +209,7 @@ void replayFromInputStats( } vt_debug_print( terse, replay, - "Post-lb num objects: {}\n", objects_here.size() + "Post-lb num objects: {}\n", migratable_objects_here.size() ); }); runInEpochCollective("StatsReplayDriver -> destroyLB", [&] { @@ -225,7 +225,6 @@ objgroup::proxy::Proxy LBStatsMigrator::construct(std::shared_ptr model_base) { auto my_proxy = theObjGroup()->makeCollective(); auto strat = my_proxy.get(); - strat->init(my_proxy); auto base_proxy = my_proxy.template registerBaseCollective(); vt_debug_print( verbose, replay, @@ -237,10 +236,6 @@ LBStatsMigrator::construct(std::shared_ptr model_base) { return my_proxy; } -void LBStatsMigrator::init(objgroup::proxy::Proxy in_proxy) { - proxy = in_proxy; -} - void LBStatsMigrator::runLB(TimeType) { } void LBStatsMigrator::inputParams(SpecEntry* spec) { } @@ -254,7 +249,7 @@ LBStatsMigrator::getInputKeysWithHelp() { std::shared_ptr LBStatsMigrator::createStatsAtHomeModel( std::shared_ptr model_base, - std::set objects_here + std::set migratable_objects_here ) { auto const this_rank = vt::theContext()->getNode(); vt_debug_print( @@ -268,7 +263,7 @@ LBStatsMigrator::createStatsAtHomeModel( // if the object belongs here, do nothing; otherwise, "transfer" it to // the home rank if (stat_obj_id.getHomeNode() != this_rank) { - if (objects_here.count(stat_obj_id) == 0) { + if (migratable_objects_here.count(stat_obj_id) == 0) { vt_debug_print( verbose, replay, "will transfer load of {} home to {}\n", @@ -302,7 +297,7 @@ LBStatsMigrator::createStatsAtHomeModel( std::shared_ptr LBStatsMigrator::createStatsHereModel( std::shared_ptr model_base, - std::set objects_here + std::set migratable_objects_here ) { auto const this_rank = vt::theContext()->getNode(); vt_debug_print( @@ -311,31 +306,29 @@ LBStatsMigrator::createStatsHereModel( ); runInEpochCollective("LBStatsMigrator -> transferStatsHere", [&] { - for (auto stat_obj_id : objects_here) { - if (stat_obj_id.isMigratable()) { - // if the object is already here, do nothing; otherwise, "transfer" it - // from the home rank - bool stats_here = false; - for (auto other_id : *model_base) { - if (stat_obj_id == other_id) { - stats_here = true; - break; - } + for (auto stat_obj_id : migratable_objects_here) { + // if the object is already here, do nothing; otherwise, "transfer" it + // from the home rank + bool stats_here = false; + for (auto other_id : *model_base) { + if (stat_obj_id == other_id) { + stats_here = true; + break; } - if (!stats_here) { - // check that this isn't something that should already have been here - assert(stat_obj_id.getHomeNode() != this_rank); + } + if (!stats_here) { + // check that this isn't something that should already have been here + assert(stat_obj_id.getHomeNode() != this_rank); - vt_debug_print( - verbose, replay, - "will transfer load of {} from home {}\n", - stat_obj_id, stat_obj_id.getHomeNode() - ); - ObjIDType mod_id = stat_obj_id; - // Override curr_node to force retrieval from the home rank - mod_id.curr_node = stat_obj_id.getHomeNode(); - migrateObjectTo(mod_id, this_rank); - } + vt_debug_print( + verbose, replay, + "will transfer load of {} from home {}\n", + stat_obj_id, stat_obj_id.getHomeNode() + ); + ObjIDType mod_id = stat_obj_id; + // Override curr_node to force retrieval from the home rank + mod_id.curr_node = stat_obj_id.getHomeNode(); + migrateObjectTo(mod_id, this_rank); } } }); diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/stats_replay.h index bf313b79e8..44efad4f03 100644 --- a/src/vt/vrt/collection/balance/stats_replay.h +++ b/src/vt/vrt/collection/balance/stats_replay.h @@ -71,8 +71,6 @@ struct LBStatsMigrator : lb::BaseLB { static objgroup::proxy::Proxy construct(std::shared_ptr model_base); - void init(objgroup::proxy::Proxy in_proxy); - void runLB(TimeType) override; void inputParams(SpecEntry* spec) override; @@ -84,17 +82,14 @@ struct LBStatsMigrator : lb::BaseLB { std::shared_ptr createStatsAtHomeModel( std::shared_ptr model_base, - std::set objects_here + std::set migratable_objects_here ); std::shared_ptr createStatsHereModel( std::shared_ptr model_base, - std::set objects_here + std::set migratable_objects_here ); - -private: - objgroup::proxy::Proxy proxy = {}; }; }}}} /* end namespace vt::vrt::collection::balance */ From 9af4c300fb70f9b83d260535274cad2055ade102 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 7 Mar 2022 15:19:53 -0800 Subject: [PATCH 05/41] #1265: replay: fix crash when no lb selected --- src/vt/vrt/collection/balance/stats_replay.cc | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index f20b6d5630..19e16f551a 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -75,6 +75,17 @@ void replayFromInputStats( // allow remembering what objects are here after the load balancer migrates std::set migratable_objects_here; + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + // point the load model at the stats for the relevant phase + runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { + base_load_model->updateLoads(initial_phase); + }); + for (auto stat_obj_id : *base_load_model) { + if (stat_obj_id.isMigratable()) { + migratable_objects_here.insert(stat_obj_id); + } + } // simulate the requested number of phases auto const this_rank = theContext()->getNode(); @@ -191,20 +202,22 @@ void replayFromInputStats( // instead, remember where the LB wanted to migrate objects lb_reassignment = theLBManager()->selectStartLB(phase); - auto proposed_model = std::make_shared( - pre_lb_load_model, lb_reassignment - ); - migratable_objects_here.clear(); - for (auto it = proposed_model->begin(); it.isValid(); ++it) { - if ((*it).isMigratable()) { - ObjIDType loc_id = *it; - loc_id.curr_node = this_rank; - migratable_objects_here.insert(loc_id); - vt_debug_print( - normal, replay, - "element {} is here on phase {} post-lb\n", - loc_id, phase - ); + if (lb_reassignment) { + auto proposed_model = std::make_shared( + pre_lb_load_model, lb_reassignment + ); + migratable_objects_here.clear(); + for (auto it = proposed_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + ObjIDType loc_id = *it; + loc_id.curr_node = this_rank; + migratable_objects_here.insert(loc_id); + vt_debug_print( + normal, replay, + "element {} is here on phase {} post-lb\n", + loc_id, phase + ); + } } } vt_debug_print( From 35020fe2074bd204d746117677207d5f78ce5954 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 8 Mar 2022 16:01:41 -0800 Subject: [PATCH 06/41] #1265: replay: move driver into tools --- CMakeLists.txt | 21 +++++++++++ examples/collection/CMakeLists.txt | 1 - tools/CMakeLists.txt | 35 +++++++++++++++++++ tools/stats_replay/CMakeLists.txt | 11 ++++++ .../stats_replay/simulate_replay.cc | 0 5 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 tools/CMakeLists.txt create mode 100644 tools/stats_replay/CMakeLists.txt rename examples/collection/stats_replay_driver.cc => tools/stats_replay/simulate_replay.cc (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a141f79cdf..9d8599bea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ set(PROJECT_BIN_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(PROJECT_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PROJECT_LIB_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib) set(PROJECT_EXAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/examples) +set(PROJECT_TOOLS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools) # Import the linking macros for VT-related targets include(cmake/link_vt.cmake) @@ -114,6 +115,26 @@ if (VT_BUILD_TESTS include(CTest) endif() +# +# Tools +# +option(VT_BUILD_TOOLS "Build VT tools" ON) + +if (VT_BUILD_TOOLS) + message( + STATUS + "VT: building tools" + ) + + add_custom_target(tools) + add_subdirectory(tools) +else() + message( + STATUS "VT: NOT building tools because VT_BUILD_TOOLS is not set.\ + Tools that are not built are NOT TESTED." + ) +endif() + # # Examples # diff --git a/examples/collection/CMakeLists.txt b/examples/collection/CMakeLists.txt index e9eaf14fbb..edbd11d64e 100644 --- a/examples/collection/CMakeLists.txt +++ b/examples/collection/CMakeLists.txt @@ -9,7 +9,6 @@ set( insertable_collection reduce_integral transpose - stats_replay_driver ) foreach(EXAMPLE_NAME ${COLLECTION_EXAMPLES}) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 0000000000..674a291547 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,35 @@ + +# +# Tools +# + +include(turn_on_warnings) + +macro(add_tool tool_name) + set(TOOL_FILE "${tool_name}.cc") + + add_executable(${tool_name} ${TOOL_FILE}) + add_dependencies(tools ${tool_name}) + + turn_on_warnings(${tool_name}) + + if (vt_unity_build_enabled) + set_target_properties(${tool_name} PROPERTIES UNITY_BUILD ON) + endif() + + link_target_with_vt( + TARGET ${tool_name} + DEFAULT_LINK_SET + ) + +### @todo Add command-line arguments for testing +# if (BUILD_TESTING) +# add_test_for_example_vt( +# ${tool_name} +# ${TOOL_FILE} +# tool_tests +# ) +# endif() +endmacro() + +add_subdirectory(stats_replay) diff --git a/tools/stats_replay/CMakeLists.txt b/tools/stats_replay/CMakeLists.txt new file mode 100644 index 0000000000..a7bf5f104e --- /dev/null +++ b/tools/stats_replay/CMakeLists.txt @@ -0,0 +1,11 @@ + +set( + STATS_REPLAY_TOOLS + simulate_replay +) + +foreach(TOOL_NAME ${STATS_REPLAY_TOOLS}) + # message("Example: building stats replay tool >>>>> ${TOOL_NAME}") + + add_tool(${TOOL_NAME}) +endforeach() diff --git a/examples/collection/stats_replay_driver.cc b/tools/stats_replay/simulate_replay.cc similarity index 100% rename from examples/collection/stats_replay_driver.cc rename to tools/stats_replay/simulate_replay.cc From c8200258765df05f259203ef7a163d083df4bd8a Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 22 Mar 2022 12:46:27 -0700 Subject: [PATCH 07/41] #1265: replay: improve debugging --- src/vt/vrt/collection/balance/stats_replay.cc | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index 19e16f551a..1122cd80ba 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -67,6 +67,22 @@ void replayFromInputStats( auto json = r.readFile(); auto sd = StatsData(*json); + for (auto &phase_data : sd.node_data_) { + vt_debug_print( + normal, replay, + "found {} loads for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + + for (auto &phase_data : sd.node_comm_) { + vt_debug_print( + normal, replay, + "found {} comms for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + // remember vt's base load model auto base_load_model = theLBManager()->getBaseLoadModel(); @@ -274,7 +290,8 @@ LBStatsMigrator::createStatsAtHomeModel( for (auto stat_obj_id : *model_base) { if (stat_obj_id.isMigratable()) { // if the object belongs here, do nothing; otherwise, "transfer" it to - // the home rank + // the home rank so that it can later be sent to the rank holding the + // object if (stat_obj_id.getHomeNode() != this_rank) { if (migratable_objects_here.count(stat_obj_id) == 0) { vt_debug_print( @@ -321,7 +338,7 @@ LBStatsMigrator::createStatsHereModel( runInEpochCollective("LBStatsMigrator -> transferStatsHere", [&] { for (auto stat_obj_id : migratable_objects_here) { // if the object is already here, do nothing; otherwise, "transfer" it - // from the home rank + // from the home rank so that we will have the needed stats data bool stats_here = false; for (auto other_id : *model_base) { if (stat_obj_id == other_id) { From 02d09d74fb10d7d9e3fbd2403acdfa3c95c660c0 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 22 Mar 2022 13:14:48 -0700 Subject: [PATCH 08/41] #1265: replay: reduce redundant code --- src/vt/vrt/collection/balance/stats_replay.cc | 56 +++++++++---------- src/vt/vrt/collection/balance/stats_replay.h | 6 ++ 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index 1122cd80ba..1b8c82e2a8 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -275,6 +275,30 @@ LBStatsMigrator::getInputKeysWithHelp() { return keys_help; } +/*static*/ +std::shared_ptr +LBStatsMigrator::updateCurrentNodes( + std::shared_ptr lb_reassignment +) { + auto modified_reassignment = std::make_shared(); + modified_reassignment->node_ = lb_reassignment->node_; + modified_reassignment->global_migration_count = + lb_reassignment->global_migration_count; + for (auto &dep : lb_reassignment->depart_) { + ObjIDType id = dep.first; + NodeType dest = dep.second; + id.curr_node = dest; + modified_reassignment->depart_[id] = dest; + } + auto const this_rank = vt::theContext()->getNode(); + for (auto &arr : lb_reassignment->arrive_) { + ObjIDType id = arr.first; + id.curr_node = this_rank; + modified_reassignment->arrive_[id] = arr.second; + } + return modified_reassignment; +} + std::shared_ptr LBStatsMigrator::createStatsAtHomeModel( std::shared_ptr model_base, @@ -307,20 +331,7 @@ LBStatsMigrator::createStatsAtHomeModel( }); auto tmp_assignment = normalizeReassignments(); - auto home_assignment = std::make_shared(); - home_assignment->node_ = tmp_assignment->node_; - home_assignment->global_migration_count = tmp_assignment->global_migration_count; - for (auto &dep : tmp_assignment->depart_) { - ObjIDType id = dep.first; - NodeType dest = dep.second; - id.curr_node = dest; - home_assignment->depart_[id] = dest; - } - for (auto &arr : tmp_assignment->arrive_) { - ObjIDType id = arr.first; - id.curr_node = this_rank; - home_assignment->arrive_[id] = arr.second; - } + auto home_assignment = updateCurrentNodes(tmp_assignment); return std::make_shared(model_base, home_assignment); } @@ -364,22 +375,9 @@ LBStatsMigrator::createStatsHereModel( }); auto tmp_assignment = normalizeReassignments(); - // now restore the curr_node values to reflect the placement of the "real" object - auto here_assignment = std::make_shared(); - here_assignment->node_ = tmp_assignment->node_; - here_assignment->global_migration_count = tmp_assignment->global_migration_count; - for (auto &dep : tmp_assignment->depart_) { - ObjIDType id = dep.first; - NodeType dest = dep.second; - id.curr_node = dest; - here_assignment->depart_[id] = dest; - } - for (auto &arr : tmp_assignment->arrive_) { - ObjIDType id = arr.first; - id.curr_node = this_rank; - here_assignment->arrive_[id] = arr.second; - } + auto here_assignment = updateCurrentNodes(tmp_assignment); + return std::make_shared(model_base, here_assignment); } diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/stats_replay.h index 44efad4f03..4b06120d08 100644 --- a/src/vt/vrt/collection/balance/stats_replay.h +++ b/src/vt/vrt/collection/balance/stats_replay.h @@ -79,6 +79,12 @@ struct LBStatsMigrator : lb::BaseLB { using BaseLB::normalizeReassignments; + static + std::shared_ptr + updateCurrentNodes( + std::shared_ptr lb_reassignment + ); + std::shared_ptr createStatsAtHomeModel( std::shared_ptr model_base, From f59499a16933cbccbbab1517808614eb4b402867 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 7 Mar 2022 12:32:58 -0800 Subject: [PATCH 09/41] #1265: tests: first set of tests for workload replay --- .../unit/collection/test_lb_stats_migrator.cc | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 tests/unit/collection/test_lb_stats_migrator.cc diff --git a/tests/unit/collection/test_lb_stats_migrator.cc b/tests/unit/collection/test_lb_stats_migrator.cc new file mode 100644 index 0000000000..4360b8134a --- /dev/null +++ b/tests/unit/collection/test_lb_stats_migrator.cc @@ -0,0 +1,325 @@ +/* +//@HEADER +// ***************************************************************************** +// +// test_lb_stats_migrator.cc +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include + +#include "test_parallel_harness.h" +#include "test_collection_common.h" + +#include "vt/elm/elm_id.h" +#include "vt/elm/elm_id_bits.h" +#include "vt/vrt/collection/balance/lb_common.h" +#include "vt/vrt/collection/balance/stats_data.h" +#include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" +#include "vt/vrt/collection/balance/stats_replay.h" +#include "vt/vrt/collection/balance/model/proposed_reassignment.h" + +#if vt_check_enabled(lblite) + +namespace vt { namespace tests { namespace unit { namespace reassignment { + +using namespace vt::tests::unit; + +struct TestLBStatsMigrator : TestParallelHarness { }; + +std::unique_ptr +setupStats(PhaseType phase, size_t numElements) { + auto const& this_node = vt::theContext()->getNode(); + + using vt::vrt::collection::balance::ElementIDStruct; + + std::vector myElemList(numElements); + + for (size_t ii = 0; ii < numElements; ++ii) { + myElemList[ii] = elm::ElmIDBits::createCollectionImpl( + true, ii+1, this_node, this_node + ); + } + + using vt::vrt::collection::balance::StatsData; + auto sd = std::make_unique(); + + for (auto&& elmID : myElemList) { + double tval = elmID.id * 2; + sd->node_data_[phase][elmID].whole_phase_load = tval; + } + + return std::move(sd); +} + + +TEST_F(TestLBStatsMigrator, test_normalize_call) { + auto const& this_node = vt::theContext()->getNode(); + auto const& num_nodes = vt::theContext()->getNumNodes(); + + PhaseType phase = 0; + const size_t numElements = 5; + + using vt::vrt::collection::balance::StatsData; + auto sd = setupStats(phase, numElements); + + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + using vt::vrt::collection::balance::LBStatsMigrator; + vt::objgroup::proxy::Proxy norm_lb_proxy; + using vt::vrt::collection::balance::ProposedReassignment; + std::shared_ptr new_model = nullptr; + + // choose a set of migrations for the load model to represent + vt::runInEpochCollective("do_lb", [&]{ + norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + auto normalizer = norm_lb_proxy.get(); + + vt::runInEpochCollective("choose migrations", [&]{ + for (auto obj_id : *base_load_model) { + if (obj_id.isMigratable()) { + vt::NodeType dest = obj_id.id % num_nodes; + normalizer->migrateObjectTo(obj_id, dest); + } + } + }); + + auto reassignment = normalizer->normalizeReassignments(); + new_model = std::make_shared( + base_load_model, LBStatsMigrator::updateCurrentNodes(reassignment) + ); + }); + vt::runInEpochCollective("destroy lb", [&]{ + norm_lb_proxy.destroyCollective(); + }); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *new_model) { + if (obj_id.isMigratable()) { + vt::NodeType dest = obj_id.id % num_nodes; + EXPECT_EQ(dest, this_node); + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = new_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + +TEST_F(TestLBStatsMigrator, test_move_data_home) { + auto const& this_node = vt::theContext()->getNode(); + + PhaseType phase = 0; + const size_t numElements = 5; + + using vt::vrt::collection::balance::StatsData; + auto sd = setupStats(phase, numElements); + + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::ProposedReassignment; + using vt::vrt::collection::balance::LBType; + using ObjIDType = vt::elm::ElementIDStruct; + std::shared_ptr not_home_model = nullptr; + + // move everything off the home node + vt::runInEpochCollective("do shift", [&]{ + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); + if (lb_reassignment != nullptr) { + fmt::print( + "{}: global_mig={}, depart={}, arrive={}\n", + lb_reassignment->node_, + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + not_home_model = std::make_shared( + base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + ); + } + }); + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + // list nothing as here so that we skip the optimization + std::set no_migratable_objects_here; + + vt::objgroup::proxy::Proxy norm_lb_proxy; + std::shared_ptr back_home_model = nullptr; + + // then create a load model that restores them to homes + vt::runInEpochCollective("migrate stats home", [&]{ + norm_lb_proxy = LBStatsMigrator::construct(not_home_model); + auto normalizer = norm_lb_proxy.get(); + + back_home_model = normalizer->createStatsAtHomeModel( + not_home_model, no_migratable_objects_here + ); + }); + runInEpochCollective("destroy migrator", [&]{ + norm_lb_proxy.destroyCollective(); + }); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *back_home_model) { + if (obj_id.isMigratable()) { + auto home = obj_id.getHomeNode(); + EXPECT_EQ(home, this_node); + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = back_home_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + +TEST_F(TestLBStatsMigrator, test_move_some_data_home) { + auto const& this_node = vt::theContext()->getNode(); + auto const& num_nodes = vt::theContext()->getNumNodes(); + + PhaseType phase = 0; + const size_t numElements = 5; + + using vt::vrt::collection::balance::StatsData; + auto sd = setupStats(phase, numElements); + + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::ProposedReassignment; + using vt::vrt::collection::balance::LBType; + using ObjIDType = vt::elm::ElementIDStruct; + std::set migratable_objects_here; + std::shared_ptr not_home_model = nullptr; + + // move everything off the home node + vt::runInEpochCollective("do shift", [&]{ + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); + if (lb_reassignment != nullptr) { + fmt::print( + "{}: global_mig={}, depart={}, arrive={}\n", + lb_reassignment->node_, + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + not_home_model = std::make_shared( + base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + ); + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + // only claim a subset of them are here (relates to an optimization in + // the code being tested) + if ((*it).id % 3 == 0) { + migratable_objects_here.insert(*it); + } + } + } + } + }); + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + vt::objgroup::proxy::Proxy norm_lb_proxy; + std::shared_ptr back_home_if_not_here_model = nullptr; + + // then create a load model that restores them to homes + vt::runInEpochCollective("migrate stats home", [&]{ + norm_lb_proxy = LBStatsMigrator::construct(not_home_model); + auto normalizer = norm_lb_proxy.get(); + + back_home_if_not_here_model = normalizer->createStatsAtHomeModel( + not_home_model, migratable_objects_here + ); + }); + runInEpochCollective("destroy migrator", [&]{ + norm_lb_proxy.destroyCollective(); + }); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *back_home_if_not_here_model) { + if (obj_id.isMigratable()) { + auto home = obj_id.getHomeNode(); + if (obj_id.id % 3 == 0) { + // the optimization should have prevented these from moving home + EXPECT_EQ(home, (this_node + num_nodes - 1) % num_nodes); + } else { + // but these must be home now + EXPECT_EQ(home, this_node); + } + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = back_home_if_not_here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + +}}}} // end namespace vt::tests::unit::reassignment + +#endif /*vt_check_enabled(lblite)*/ From 6b2b53487ef05066468ae4608afa26d683131feb Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 11:42:57 -0700 Subject: [PATCH 10/41] #1265: tests: additional tests of workload replay --- .../unit/collection/test_lb_stats_migrator.cc | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/tests/unit/collection/test_lb_stats_migrator.cc b/tests/unit/collection/test_lb_stats_migrator.cc index 4360b8134a..58f623f9e4 100644 --- a/tests/unit/collection/test_lb_stats_migrator.cc +++ b/tests/unit/collection/test_lb_stats_migrator.cc @@ -320,6 +320,183 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_home) { } } +TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { + auto const& this_node = vt::theContext()->getNode(); + auto const& num_nodes = vt::theContext()->getNumNodes(); + + PhaseType phase = 0; + const size_t numElements = 5; + + using vt::vrt::collection::balance::StatsData; + auto sd = setupStats(phase, numElements); + + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::ProposedReassignment; + using vt::vrt::collection::balance::LBType; + using ObjIDType = vt::elm::ElementIDStruct; + std::set migratable_objects_here; + std::shared_ptr not_home_model = nullptr; + + // move everything off the home node + vt::runInEpochCollective("do shift", [&]{ + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); + if (lb_reassignment != nullptr) { + fmt::print( + "{}: global_mig={}, depart={}, arrive={}\n", + lb_reassignment->node_, + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + not_home_model = std::make_shared( + base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + ); + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); + } + } + } + }); + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + vt::objgroup::proxy::Proxy norm_lb_proxy; + std::shared_ptr here_model = nullptr; + + // then create a load model that pulls loads here from home, + // based on the base load model, not the one we just created + vt::runInEpochCollective("migrate stats here", [&]{ + norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + auto normalizer = norm_lb_proxy.get(); + + here_model = normalizer->createStatsHereModel( + base_load_model, migratable_objects_here + ); + }); + runInEpochCollective("destroy migrator", [&]{ + norm_lb_proxy.destroyCollective(); + }); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *here_model) { + if (obj_id.isMigratable()) { + auto home = obj_id.getHomeNode(); + EXPECT_EQ(home, (this_node + num_nodes - 1) % num_nodes); + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + +TEST_F(TestLBStatsMigrator, test_move_some_data_here_from_home) { + auto const& this_node = vt::theContext()->getNode(); + auto const& num_nodes = vt::theContext()->getNumNodes(); + + PhaseType phase = 0; + const size_t numElements = 5; + + using vt::vrt::collection::balance::StatsData; + auto sd = setupStats(phase, numElements); + + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json stats, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::ProposedReassignment; + using vt::vrt::collection::balance::LBType; + using ObjIDType = vt::elm::ElementIDStruct; + std::set migratable_objects_here; + std::shared_ptr not_home_model = nullptr; + + // move everything off the home node + vt::runInEpochCollective("do shift", [&]{ + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); + if (lb_reassignment != nullptr) { + fmt::print( + "{}: global_mig={}, depart={}, arrive={}\n", + lb_reassignment->node_, + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + not_home_model = std::make_shared( + base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + ); + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + // only claim a subset of them are here (relates to an optimization in + // the code being tested) + if ((*it).id % 3 == 0) { + migratable_objects_here.insert(*it); + } + } + } + } + }); + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + vt::objgroup::proxy::Proxy norm_lb_proxy; + std::shared_ptr here_model = nullptr; + + // then create a load model that pulls loads here from home, + // based on the base load model, not the one we just created + vt::runInEpochCollective("migrate stats here", [&]{ + norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + auto normalizer = norm_lb_proxy.get(); + + here_model = normalizer->createStatsHereModel( + base_load_model, migratable_objects_here + ); + }); + runInEpochCollective("destroy migrator", [&]{ + norm_lb_proxy.destroyCollective(); + }); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *here_model) { + if (obj_id.isMigratable()) { + auto home = obj_id.getHomeNode(); + if (obj_id.id % 3 == 0) { + // these must have moved here from home + EXPECT_EQ(home, (this_node + num_nodes - 1) % num_nodes); + } else { + // but the optimization should have prevented these from moving away + // from home + EXPECT_EQ(home, this_node); + } + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + }}}} // end namespace vt::tests::unit::reassignment #endif /*vt_check_enabled(lblite)*/ From 0405dca96fccedb501cd210382bef57f20ce50de Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 11:58:15 -0700 Subject: [PATCH 11/41] #1265: replay: refactor for better testing --- src/vt/vrt/collection/balance/stats_replay.cc | 57 +++++++++++-------- src/vt/vrt/collection/balance/stats_replay.h | 7 ++- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index 1b8c82e2a8..f8832148f6 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -58,30 +58,11 @@ namespace balance { void replayFromInputStats( PhaseType initial_phase, PhaseType phases_to_run ) { - using util::json::Reader; using ObjIDType = elm::ElementIDStruct; // read in object loads from json files auto const filename = theConfig()->getLBStatsFileIn(); - Reader r{filename}; - auto json = r.readFile(); - auto sd = StatsData(*json); - - for (auto &phase_data : sd.node_data_) { - vt_debug_print( - normal, replay, - "found {} loads for phase {}\n", - phase_data.second.size(), phase_data.first - ); - } - - for (auto &phase_data : sd.node_comm_) { - vt_debug_print( - normal, replay, - "found {} comms for phase {}\n", - phase_data.second.size(), phase_data.first - ); - } + auto sd = LBStatsMigrator::readInWorkloads(filename); // remember vt's base load model auto base_load_model = theLBManager()->getBaseLoadModel(); @@ -92,7 +73,7 @@ void replayFromInputStats( // allow remembering what objects are here after the load balancer migrates std::set migratable_objects_here; // force it to use our json stats, not anything it may have collected - base_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); // point the load model at the stats for the relevant phase runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { base_load_model->updateLoads(initial_phase); @@ -111,7 +92,7 @@ void replayFromInputStats( theLBManager()->setLoadModel(base_load_model); // force it to use our json stats, not anything it may have collected - base_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); // point the load model at the stats for the relevant phase runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { @@ -173,7 +154,7 @@ void replayFromInputStats( norm_lb_proxy.destroyCollective(); }); theLBManager()->setLoadModel(pre_lb_load_model); - pre_lb_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); runInEpochCollective("StatsReplayDriver -> migrateStatsDataHere", [&] { auto norm_lb_proxy = LBStatsMigrator::construct(pre_lb_load_model); @@ -184,7 +165,7 @@ void replayFromInputStats( norm_lb_proxy.destroyCollective(); }); theLBManager()->setLoadModel(pre_lb_load_model); - pre_lb_load_model->setLoads(&sd.node_data_, &sd.node_comm_); + pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); } if (this_rank == 0) { @@ -299,6 +280,34 @@ LBStatsMigrator::updateCurrentNodes( return modified_reassignment; } +/*static*/ +std::shared_ptr +LBStatsMigrator::readInWorkloads(std::string filename) { + using util::json::Reader; + + Reader r{filename}; + auto json = r.readFile(); + auto sd = std::make_shared(*json); + + for (auto &phase_data : sd->node_data_) { + vt_debug_print( + normal, replay, + "found {} loads for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + + for (auto &phase_data : sd->node_comm_) { + vt_debug_print( + normal, replay, + "found {} comms for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + + return sd; +} + std::shared_ptr LBStatsMigrator::createStatsAtHomeModel( std::shared_ptr model_base, diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/stats_replay.h index 4b06120d08..0126c1d8de 100644 --- a/src/vt/vrt/collection/balance/stats_replay.h +++ b/src/vt/vrt/collection/balance/stats_replay.h @@ -47,6 +47,7 @@ #include "vt/config.h" #include "vt/elm/elm_id.h" +#include "vt/vrt/collection/balance/stats_data.h" #include "vt/vrt/collection/balance/baselb/baselb.h" #include "vt/vrt/collection/balance/model/load_model.h" #include "vt/vrt/collection/balance/model/proposed_reassignment.h" @@ -79,12 +80,14 @@ struct LBStatsMigrator : lb::BaseLB { using BaseLB::normalizeReassignments; - static - std::shared_ptr + static std::shared_ptr updateCurrentNodes( std::shared_ptr lb_reassignment ); + static std::shared_ptr + readInWorkloads(std::string filename); + std::shared_ptr createStatsAtHomeModel( std::shared_ptr model_base, From 2d507600d23607bdffeffa360ff397912d969647 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 12:15:57 -0700 Subject: [PATCH 12/41] #1265: replay: strike all refs to stats except filenames --- src/vt/vrt/collection/balance/stats_replay.cc | 83 ++++++++-------- src/vt/vrt/collection/balance/stats_replay.h | 12 +-- .../unit/collection/test_lb_stats_migrator.cc | 94 ++++++++++--------- tools/stats_replay/simulate_replay.cc | 2 +- 4 files changed, 98 insertions(+), 93 deletions(-) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/stats_replay.cc index f8832148f6..c61213bc62 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/stats_replay.cc @@ -55,14 +55,14 @@ namespace vt { namespace vrt { namespace collection { namespace balance { -void replayFromInputStats( +void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run ) { using ObjIDType = elm::ElementIDStruct; // read in object loads from json files auto const filename = theConfig()->getLBStatsFileIn(); - auto sd = LBStatsMigrator::readInWorkloads(filename); + auto sd = WorkloadDataMigrator::readInWorkloads(filename); // remember vt's base load model auto base_load_model = theLBManager()->getBaseLoadModel(); @@ -72,10 +72,10 @@ void replayFromInputStats( // allow remembering what objects are here after the load balancer migrates std::set migratable_objects_here; - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); - // point the load model at the stats for the relevant phase - runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { + // point the load model at the workloads for the relevant phase + runInEpochCollective("WorkloadReplayDriver -> updateLoads", [=] { base_load_model->updateLoads(initial_phase); }); for (auto stat_obj_id : *base_load_model) { @@ -91,11 +91,11 @@ void replayFromInputStats( // reapply the base load model if in case we overwrote it on a previous iter theLBManager()->setLoadModel(base_load_model); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); - // point the load model at the stats for the relevant phase - runInEpochCollective("StatsReplayDriver -> updateLoads", [=] { + // point the load model at the workloads for the relevant phase + runInEpochCollective("WorkloadReplayDriver -> updateLoads", [=] { base_load_model->updateLoads(phase); }); @@ -105,7 +105,7 @@ void replayFromInputStats( ++count; vt_debug_print( normal, replay, - "stats for id {} are here on phase {}\n", + "workloads for id {} are here on phase {}\n", stat_obj_id, phase ); } @@ -118,14 +118,14 @@ void replayFromInputStats( auto pre_lb_load_model = base_load_model; - // if this isn't the initial phase, then the stats may exist on a rank + // if this isn't the initial phase, then the workloads may exist on a rank // other than where the objects are currently meant to exist; we will - // use a Reassignment object to get those load stats where they need to be + // use a Reassignment object to get those workloads where they need to be if (phase > initial_phase) { if (this_rank == 0) { vt_print( replay, - "Migrating imported object stats to phase {} ranks...\n", + "Migrating imported object workloads to phase {} ranks...\n", phase ); } @@ -137,18 +137,18 @@ void replayFromInputStats( // rank from which the lb removed the object; the curr_node member of // the object ids in the lb_reassignment object refers to the pre-lb // location on the previous phase, but the curr_node member for our new - // load model must point to where the stats data exists for this phase + // load model must point to where the workloads data exists for this phase - // the stats data for this phase can exist at arbitrary locations; the + // the workloads data for this phase can exist at arbitrary locations; the // only rank to know the location of this data is the one that has it; // this will be the departing node for the purposes of this load model; // we need to make sure the curr_node member of the object ids in our - // new load model points to the node on which the stats data lives + // new load model points to the node on which the workloads data lives - runInEpochCollective("StatsReplayDriver -> migrateStatsDataHome", [&] { - auto norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + runInEpochCollective("WorkloadReplayDriver -> migrateStatsDataHome", [&] { + auto norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); auto normalizer = norm_lb_proxy.get(); - pre_lb_load_model = normalizer->createStatsAtHomeModel( + pre_lb_load_model = normalizer->createModelToMoveWorkloadsHome( base_load_model, migratable_objects_here ); norm_lb_proxy.destroyCollective(); @@ -156,10 +156,10 @@ void replayFromInputStats( theLBManager()->setLoadModel(pre_lb_load_model); pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); - runInEpochCollective("StatsReplayDriver -> migrateStatsDataHere", [&] { - auto norm_lb_proxy = LBStatsMigrator::construct(pre_lb_load_model); + runInEpochCollective("WorkloadReplayDriver -> migrateStatsDataHere", [&] { + auto norm_lb_proxy = WorkloadDataMigrator::construct(pre_lb_load_model); auto normalizer = norm_lb_proxy.get(); - pre_lb_load_model = normalizer->createStatsHereModel( + pre_lb_load_model = normalizer->createModelToMoveWorkloadsHere( pre_lb_load_model, migratable_objects_here ); norm_lb_proxy.destroyCollective(); @@ -194,7 +194,7 @@ void replayFromInputStats( "constructing load model from real load balancer\n" ); - runInEpochCollective("StatsReplayDriver -> runRealLB", [&] { + runInEpochCollective("WorkloadReplayDriver -> runRealLB", [&] { // run the load balancer but don't let it automatically migrate; // instead, remember where the LB wanted to migrate objects lb_reassignment = theLBManager()->selectStartLB(phase); @@ -222,7 +222,7 @@ void replayFromInputStats( "Post-lb num objects: {}\n", migratable_objects_here.size() ); }); - runInEpochCollective("StatsReplayDriver -> destroyLB", [&] { + runInEpochCollective("WorkloadReplayDriver -> destroyLB", [&] { theLBManager()->destroyLB(); }); theCollective()->barrier(); @@ -231,14 +231,14 @@ void replayFromInputStats( /*static*/ -objgroup::proxy::Proxy -LBStatsMigrator::construct(std::shared_ptr model_base) { - auto my_proxy = theObjGroup()->makeCollective(); +objgroup::proxy::Proxy +WorkloadDataMigrator::construct(std::shared_ptr model_base) { + auto my_proxy = theObjGroup()->makeCollective(); auto strat = my_proxy.get(); auto base_proxy = my_proxy.template registerBaseCollective(); vt_debug_print( verbose, replay, - "LBStatsMigrator proxy={} base_proxy={}\n", + "WorkloadDataMigrator proxy={} base_proxy={}\n", my_proxy.getProxy(), base_proxy.getProxy() ); strat->proxy_ = base_proxy; @@ -246,19 +246,19 @@ LBStatsMigrator::construct(std::shared_ptr model_base) { return my_proxy; } -void LBStatsMigrator::runLB(TimeType) { } +void WorkloadDataMigrator::runLB(TimeType) { } -void LBStatsMigrator::inputParams(SpecEntry* spec) { } +void WorkloadDataMigrator::inputParams(SpecEntry* spec) { } std::unordered_map -LBStatsMigrator::getInputKeysWithHelp() { +WorkloadDataMigrator::getInputKeysWithHelp() { std::unordered_map const keys_help; return keys_help; } /*static*/ std::shared_ptr -LBStatsMigrator::updateCurrentNodes( +WorkloadDataMigrator::updateCurrentNodes( std::shared_ptr lb_reassignment ) { auto modified_reassignment = std::make_shared(); @@ -282,7 +282,7 @@ LBStatsMigrator::updateCurrentNodes( /*static*/ std::shared_ptr -LBStatsMigrator::readInWorkloads(std::string filename) { +WorkloadDataMigrator::readInWorkloads(std::string filename) { using util::json::Reader; Reader r{filename}; @@ -309,7 +309,7 @@ LBStatsMigrator::readInWorkloads(std::string filename) { } std::shared_ptr -LBStatsMigrator::createStatsAtHomeModel( +WorkloadDataMigrator::createModelToMoveWorkloadsHome( std::shared_ptr model_base, std::set migratable_objects_here ) { @@ -319,7 +319,7 @@ LBStatsMigrator::createStatsAtHomeModel( "constructing load model to get loads from file location to home\n" ); - runInEpochCollective("LBStatsMigrator -> transferStatsHome", [&] { + runInEpochCollective("WorkloadDataMigrator -> transferStatsHome", [&] { for (auto stat_obj_id : *model_base) { if (stat_obj_id.isMigratable()) { // if the object belongs here, do nothing; otherwise, "transfer" it to @@ -345,7 +345,7 @@ LBStatsMigrator::createStatsAtHomeModel( } std::shared_ptr -LBStatsMigrator::createStatsHereModel( +WorkloadDataMigrator::createModelToMoveWorkloadsHere( std::shared_ptr model_base, std::set migratable_objects_here ) { @@ -355,18 +355,18 @@ LBStatsMigrator::createStatsHereModel( "constructing load model to get loads from home to here\n" ); - runInEpochCollective("LBStatsMigrator -> transferStatsHere", [&] { + runInEpochCollective("WorkloadDataMigrator -> transferStatsHere", [&] { for (auto stat_obj_id : migratable_objects_here) { // if the object is already here, do nothing; otherwise, "transfer" it - // from the home rank so that we will have the needed stats data - bool stats_here = false; + // from the home rank so that we will have the needed workloads data + bool workloads_here = false; for (auto other_id : *model_base) { if (stat_obj_id == other_id) { - stats_here = true; + workloads_here = true; break; } } - if (!stats_here) { + if (!workloads_here) { // check that this isn't something that should already have been here assert(stat_obj_id.getHomeNode() != this_rank); @@ -384,7 +384,8 @@ LBStatsMigrator::createStatsHereModel( }); auto tmp_assignment = normalizeReassignments(); - // now restore the curr_node values to reflect the placement of the "real" object + // now restore the curr_node values to reflect the placement of the "real" + // object auto here_assignment = updateCurrentNodes(tmp_assignment); return std::make_shared(model_base, here_assignment); diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/stats_replay.h index 0126c1d8de..dcc100c3ab 100644 --- a/src/vt/vrt/collection/balance/stats_replay.h +++ b/src/vt/vrt/collection/balance/stats_replay.h @@ -59,17 +59,17 @@ namespace vt { namespace vrt { namespace collection { namespace balance { -void replayFromInputStats( +void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run ); -struct LBStatsMigrator : lb::BaseLB { +struct WorkloadDataMigrator : lb::BaseLB { using ObjIDType = elm::ElementIDStruct; - LBStatsMigrator() = default; + WorkloadDataMigrator() = default; - static objgroup::proxy::Proxy + static objgroup::proxy::Proxy construct(std::shared_ptr model_base); void runLB(TimeType) override; @@ -89,13 +89,13 @@ struct LBStatsMigrator : lb::BaseLB { readInWorkloads(std::string filename); std::shared_ptr - createStatsAtHomeModel( + createModelToMoveWorkloadsHome( std::shared_ptr model_base, std::set migratable_objects_here ); std::shared_ptr - createStatsHereModel( + createModelToMoveWorkloadsHere( std::shared_ptr model_base, std::set migratable_objects_here ); diff --git a/tests/unit/collection/test_lb_stats_migrator.cc b/tests/unit/collection/test_lb_stats_migrator.cc index 58f623f9e4..1a1c812c42 100644 --- a/tests/unit/collection/test_lb_stats_migrator.cc +++ b/tests/unit/collection/test_lb_stats_migrator.cc @@ -60,10 +60,10 @@ namespace vt { namespace tests { namespace unit { namespace reassignment { using namespace vt::tests::unit; -struct TestLBStatsMigrator : TestParallelHarness { }; +struct TestWorkloadDataMigrator : TestParallelHarness { }; std::unique_ptr -setupStats(PhaseType phase, size_t numElements) { +setupWorkloads(PhaseType phase, size_t numElements) { auto const& this_node = vt::theContext()->getNode(); using vt::vrt::collection::balance::ElementIDStruct; @@ -88,7 +88,7 @@ setupStats(PhaseType phase, size_t numElements) { } -TEST_F(TestLBStatsMigrator, test_normalize_call) { +TEST_F(TestWorkloadDataMigrator, test_normalize_call) { auto const& this_node = vt::theContext()->getNode(); auto const& num_nodes = vt::theContext()->getNumNodes(); @@ -96,24 +96,24 @@ TEST_F(TestLBStatsMigrator, test_normalize_call) { const size_t numElements = 5; using vt::vrt::collection::balance::StatsData; - auto sd = setupStats(phase, numElements); + auto sd = setupWorkloads(phase, numElements); auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); }); - using vt::vrt::collection::balance::LBStatsMigrator; - vt::objgroup::proxy::Proxy norm_lb_proxy; + using vt::vrt::collection::balance::WorkloadDataMigrator; + vt::objgroup::proxy::Proxy norm_lb_proxy; using vt::vrt::collection::balance::ProposedReassignment; std::shared_ptr new_model = nullptr; // choose a set of migrations for the load model to represent vt::runInEpochCollective("do_lb", [&]{ - norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); auto normalizer = norm_lb_proxy.get(); vt::runInEpochCollective("choose migrations", [&]{ @@ -127,7 +127,7 @@ TEST_F(TestLBStatsMigrator, test_normalize_call) { auto reassignment = normalizer->normalizeReassignments(); new_model = std::make_shared( - base_load_model, LBStatsMigrator::updateCurrentNodes(reassignment) + base_load_model, WorkloadDataMigrator::updateCurrentNodes(reassignment) ); }); vt::runInEpochCollective("destroy lb", [&]{ @@ -150,24 +150,24 @@ TEST_F(TestLBStatsMigrator, test_normalize_call) { } } -TEST_F(TestLBStatsMigrator, test_move_data_home) { +TEST_F(TestWorkloadDataMigrator, test_move_data_home) { auto const& this_node = vt::theContext()->getNode(); PhaseType phase = 0; const size_t numElements = 5; using vt::vrt::collection::balance::StatsData; - auto sd = setupStats(phase, numElements); + auto sd = setupWorkloads(phase, numElements); auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); }); - using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::WorkloadDataMigrator; using vt::vrt::collection::balance::ProposedReassignment; using vt::vrt::collection::balance::LBType; using ObjIDType = vt::elm::ElementIDStruct; @@ -185,7 +185,8 @@ TEST_F(TestLBStatsMigrator, test_move_data_home) { lb_reassignment->arrive_.size() ); not_home_model = std::make_shared( - base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) ); } }); @@ -196,15 +197,15 @@ TEST_F(TestLBStatsMigrator, test_move_data_home) { // list nothing as here so that we skip the optimization std::set no_migratable_objects_here; - vt::objgroup::proxy::Proxy norm_lb_proxy; + vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr back_home_model = nullptr; // then create a load model that restores them to homes - vt::runInEpochCollective("migrate stats home", [&]{ - norm_lb_proxy = LBStatsMigrator::construct(not_home_model); + vt::runInEpochCollective("migrate workloads home", [&]{ + norm_lb_proxy = WorkloadDataMigrator::construct(not_home_model); auto normalizer = norm_lb_proxy.get(); - back_home_model = normalizer->createStatsAtHomeModel( + back_home_model = normalizer->createModelToMoveWorkloadsHome( not_home_model, no_migratable_objects_here ); }); @@ -228,7 +229,7 @@ TEST_F(TestLBStatsMigrator, test_move_data_home) { } } -TEST_F(TestLBStatsMigrator, test_move_some_data_home) { +TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { auto const& this_node = vt::theContext()->getNode(); auto const& num_nodes = vt::theContext()->getNumNodes(); @@ -236,17 +237,17 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_home) { const size_t numElements = 5; using vt::vrt::collection::balance::StatsData; - auto sd = setupStats(phase, numElements); + auto sd = setupWorkloads(phase, numElements); auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); }); - using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::WorkloadDataMigrator; using vt::vrt::collection::balance::ProposedReassignment; using vt::vrt::collection::balance::LBType; using ObjIDType = vt::elm::ElementIDStruct; @@ -265,7 +266,8 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_home) { lb_reassignment->arrive_.size() ); not_home_model = std::make_shared( - base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) ); for (auto it = not_home_model->begin(); it.isValid(); ++it) { if ((*it).isMigratable()) { @@ -282,15 +284,15 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_home) { vt::theLBManager()->destroyLB(); }); - vt::objgroup::proxy::Proxy norm_lb_proxy; + vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr back_home_if_not_here_model = nullptr; // then create a load model that restores them to homes - vt::runInEpochCollective("migrate stats home", [&]{ - norm_lb_proxy = LBStatsMigrator::construct(not_home_model); + vt::runInEpochCollective("migrate workloads home", [&]{ + norm_lb_proxy = WorkloadDataMigrator::construct(not_home_model); auto normalizer = norm_lb_proxy.get(); - back_home_if_not_here_model = normalizer->createStatsAtHomeModel( + back_home_if_not_here_model = normalizer->createModelToMoveWorkloadsHome( not_home_model, migratable_objects_here ); }); @@ -320,7 +322,7 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_home) { } } -TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { +TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { auto const& this_node = vt::theContext()->getNode(); auto const& num_nodes = vt::theContext()->getNumNodes(); @@ -328,17 +330,17 @@ TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { const size_t numElements = 5; using vt::vrt::collection::balance::StatsData; - auto sd = setupStats(phase, numElements); + auto sd = setupWorkloads(phase, numElements); auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); }); - using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::WorkloadDataMigrator; using vt::vrt::collection::balance::ProposedReassignment; using vt::vrt::collection::balance::LBType; using ObjIDType = vt::elm::ElementIDStruct; @@ -357,7 +359,8 @@ TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { lb_reassignment->arrive_.size() ); not_home_model = std::make_shared( - base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) ); for (auto it = not_home_model->begin(); it.isValid(); ++it) { if ((*it).isMigratable()) { @@ -370,16 +373,16 @@ TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { vt::theLBManager()->destroyLB(); }); - vt::objgroup::proxy::Proxy norm_lb_proxy; + vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr here_model = nullptr; // then create a load model that pulls loads here from home, // based on the base load model, not the one we just created - vt::runInEpochCollective("migrate stats here", [&]{ - norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + vt::runInEpochCollective("migrate workloads here", [&]{ + norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); auto normalizer = norm_lb_proxy.get(); - here_model = normalizer->createStatsHereModel( + here_model = normalizer->createModelToMoveWorkloadsHere( base_load_model, migratable_objects_here ); }); @@ -403,7 +406,7 @@ TEST_F(TestLBStatsMigrator, test_move_data_here_from_home) { } } -TEST_F(TestLBStatsMigrator, test_move_some_data_here_from_home) { +TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { auto const& this_node = vt::theContext()->getNode(); auto const& num_nodes = vt::theContext()->getNumNodes(); @@ -411,17 +414,17 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_here_from_home) { const size_t numElements = 5; using vt::vrt::collection::balance::StatsData; - auto sd = setupStats(phase, numElements); + auto sd = setupWorkloads(phase, numElements); auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json stats, not anything it may have collected + // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); }); - using vt::vrt::collection::balance::LBStatsMigrator; + using vt::vrt::collection::balance::WorkloadDataMigrator; using vt::vrt::collection::balance::ProposedReassignment; using vt::vrt::collection::balance::LBType; using ObjIDType = vt::elm::ElementIDStruct; @@ -440,7 +443,8 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_here_from_home) { lb_reassignment->arrive_.size() ); not_home_model = std::make_shared( - base_load_model, LBStatsMigrator::updateCurrentNodes(lb_reassignment) + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) ); for (auto it = not_home_model->begin(); it.isValid(); ++it) { if ((*it).isMigratable()) { @@ -457,16 +461,16 @@ TEST_F(TestLBStatsMigrator, test_move_some_data_here_from_home) { vt::theLBManager()->destroyLB(); }); - vt::objgroup::proxy::Proxy norm_lb_proxy; + vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr here_model = nullptr; // then create a load model that pulls loads here from home, // based on the base load model, not the one we just created - vt::runInEpochCollective("migrate stats here", [&]{ - norm_lb_proxy = LBStatsMigrator::construct(base_load_model); + vt::runInEpochCollective("migrate workloads here", [&]{ + norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); auto normalizer = norm_lb_proxy.get(); - here_model = normalizer->createStatsHereModel( + here_model = normalizer->createModelToMoveWorkloadsHere( base_load_model, migratable_objects_here ); }); diff --git a/tools/stats_replay/simulate_replay.cc b/tools/stats_replay/simulate_replay.cc index 398673a172..095a9a933a 100644 --- a/tools/stats_replay/simulate_replay.cc +++ b/tools/stats_replay/simulate_replay.cc @@ -60,7 +60,7 @@ int main(int argc, char** argv) { // number of phases to simulate PhaseType phases_to_run = atoi(argv[2]); - vt::vrt::collection::balance::replayFromInputStats( + vt::vrt::collection::balance::replayWorkloads( initial_phase, phases_to_run ); From e1800e30d64ac26308b49d505ea2309b99807f52 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 12:26:58 -0700 Subject: [PATCH 13/41] #1265: replay: updated filenames to not ref stats --- .../balance/{stats_replay.cc => workload_replay.cc} | 4 ++-- .../balance/{stats_replay.h => workload_replay.h} | 8 ++++---- ...ats_migrator.cc => test_workload_data_migrator.cc} | 4 ++-- tools/CMakeLists.txt | 2 +- tools/stats_replay/CMakeLists.txt | 11 ----------- tools/workload_replay/CMakeLists.txt | 11 +++++++++++ .../simulate_replay.cc | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) rename src/vt/vrt/collection/balance/{stats_replay.cc => workload_replay.cc} (99%) rename src/vt/vrt/collection/balance/{stats_replay.h => workload_replay.h} (93%) rename tests/unit/collection/{test_lb_stats_migrator.cc => test_workload_data_migrator.cc} (99%) delete mode 100644 tools/stats_replay/CMakeLists.txt create mode 100644 tools/workload_replay/CMakeLists.txt rename tools/{stats_replay => workload_replay}/simulate_replay.cc (96%) diff --git a/src/vt/vrt/collection/balance/stats_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc similarity index 99% rename from src/vt/vrt/collection/balance/stats_replay.cc rename to src/vt/vrt/collection/balance/workload_replay.cc index c61213bc62..d1fd12f7b3 100644 --- a/src/vt/vrt/collection/balance/stats_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// stats_replay.cc +// workload_replay.cc // DARMA Toolkit v. 1.0.0 // DARMA/vt => Virtual Transport // @@ -43,7 +43,7 @@ */ #include "vt/config.h" -#include "vt/vrt/collection/balance/stats_replay.h" +#include "vt/vrt/collection/balance/workload_replay.h" #include "vt/vrt/collection/balance/stats_data.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" #include "vt/utils/json/json_reader.h" diff --git a/src/vt/vrt/collection/balance/stats_replay.h b/src/vt/vrt/collection/balance/workload_replay.h similarity index 93% rename from src/vt/vrt/collection/balance/stats_replay.h rename to src/vt/vrt/collection/balance/workload_replay.h index dcc100c3ab..cb239d5e05 100644 --- a/src/vt/vrt/collection/balance/stats_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// stats_replay.h +// workload_replay.h // DARMA Toolkit v. 1.0.0 // DARMA/vt => Virtual Transport // @@ -42,8 +42,8 @@ //@HEADER */ -#if !defined INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H -#define INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H +#if !defined INCLUDED_VT_VRT_COLLECTION_BALANCE_WORKLOAD_REPLAY_H +#define INCLUDED_VT_VRT_COLLECTION_BALANCE_WORKLOAD_REPLAY_H #include "vt/config.h" #include "vt/elm/elm_id.h" @@ -103,4 +103,4 @@ struct WorkloadDataMigrator : lb::BaseLB { }}}} /* end namespace vt::vrt::collection::balance */ -#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_STATS_REPLAY_H*/ +#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_WORKLOAD_REPLAY_H*/ diff --git a/tests/unit/collection/test_lb_stats_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc similarity index 99% rename from tests/unit/collection/test_lb_stats_migrator.cc rename to tests/unit/collection/test_workload_data_migrator.cc index 1a1c812c42..3a1308cb4f 100644 --- a/tests/unit/collection/test_lb_stats_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// test_lb_stats_migrator.cc +// test_workload_data_migrator.cc // DARMA/vt => Virtual Transport // // Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC @@ -51,7 +51,7 @@ #include "vt/vrt/collection/balance/lb_common.h" #include "vt/vrt/collection/balance/stats_data.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" -#include "vt/vrt/collection/balance/stats_replay.h" +#include "vt/vrt/collection/balance/workload_replay.h" #include "vt/vrt/collection/balance/model/proposed_reassignment.h" #if vt_check_enabled(lblite) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 674a291547..2dfcb01d4a 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -32,4 +32,4 @@ macro(add_tool tool_name) # endif() endmacro() -add_subdirectory(stats_replay) +add_subdirectory(workload_replay) diff --git a/tools/stats_replay/CMakeLists.txt b/tools/stats_replay/CMakeLists.txt deleted file mode 100644 index a7bf5f104e..0000000000 --- a/tools/stats_replay/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ - -set( - STATS_REPLAY_TOOLS - simulate_replay -) - -foreach(TOOL_NAME ${STATS_REPLAY_TOOLS}) - # message("Example: building stats replay tool >>>>> ${TOOL_NAME}") - - add_tool(${TOOL_NAME}) -endforeach() diff --git a/tools/workload_replay/CMakeLists.txt b/tools/workload_replay/CMakeLists.txt new file mode 100644 index 0000000000..c9480ff0a9 --- /dev/null +++ b/tools/workload_replay/CMakeLists.txt @@ -0,0 +1,11 @@ + +set( + WORKLOAD_REPLAY_TOOLS + simulate_replay +) + +foreach(TOOL_NAME ${WORKLOAD_REPLAY_TOOLS}) + # message("Example: building workload replay tool >>>>> ${TOOL_NAME}") + + add_tool(${TOOL_NAME}) +endforeach() diff --git a/tools/stats_replay/simulate_replay.cc b/tools/workload_replay/simulate_replay.cc similarity index 96% rename from tools/stats_replay/simulate_replay.cc rename to tools/workload_replay/simulate_replay.cc index 095a9a933a..f9ab1b6c73 100644 --- a/tools/stats_replay/simulate_replay.cc +++ b/tools/workload_replay/simulate_replay.cc @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// stats_replay_driver.cc +// simulate_replay.cc // DARMA Toolkit v. 1.0.0 // DARMA/vt => Virtual Transport // @@ -43,7 +43,7 @@ */ #include -#include +#include int main(int argc, char** argv) { using vt::PhaseType; From 10115a3615ddc3816e964868fc1cb7e33cf36d25 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 13:08:05 -0700 Subject: [PATCH 14/41] #1265: tests: reduce redundant code --- .../collection/test_workload_data_migrator.cc | 261 ++++++------------ 1 file changed, 92 insertions(+), 169 deletions(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 3a1308cb4f..9f4103f416 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -60,9 +60,14 @@ namespace vt { namespace tests { namespace unit { namespace reassignment { using namespace vt::tests::unit; +using vt::vrt::collection::balance::StatsData; +using vt::vrt::collection::balance::LoadModel; +using vt::vrt::collection::balance::ProposedReassignment; +using vt::vrt::collection::balance::WorkloadDataMigrator; + struct TestWorkloadDataMigrator : TestParallelHarness { }; -std::unique_ptr +std::shared_ptr setupWorkloads(PhaseType phase, size_t numElements) { auto const& this_node = vt::theContext()->getNode(); @@ -76,15 +81,59 @@ setupWorkloads(PhaseType phase, size_t numElements) { ); } - using vt::vrt::collection::balance::StatsData; - auto sd = std::make_unique(); + auto sd = std::make_shared(); for (auto&& elmID : myElemList) { double tval = elmID.id * 2; sd->node_data_[phase][elmID].whole_phase_load = tval; } - return std::move(sd); + return sd; +} + +std::shared_ptr +setupBaseModel(PhaseType phase, std::shared_ptr sd) { + auto base_load_model = vt::theLBManager()->getBaseLoadModel(); + // force it to use our json workloads, not anything it may have collected + base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + + vt::runInEpochCollective("updateLoads", [&]{ + base_load_model->updateLoads(phase); + }); + + return base_load_model; +} + +std::shared_ptr +shiftObjectsRight( + std::shared_ptr base_load_model, + vt::PhaseType phase +) { + std::shared_ptr new_model = nullptr; + + vt::runInEpochCollective("do shift", [&]{ + using vt::vrt::collection::balance::LBType; + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); + if (lb_reassignment != nullptr) { + vt_debug_print( + normal, replay, + "global_mig={}, depart={}, arrive={}\n", + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + new_model = std::make_shared( + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) + ); + } + }); + + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + return new_model; } @@ -95,20 +144,10 @@ TEST_F(TestWorkloadDataMigrator, test_normalize_call) { PhaseType phase = 0; const size_t numElements = 5; - using vt::vrt::collection::balance::StatsData; auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); - auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); - - vt::runInEpochCollective("updateLoads", [&]{ - base_load_model->updateLoads(phase); - }); - - using vt::vrt::collection::balance::WorkloadDataMigrator; vt::objgroup::proxy::Proxy norm_lb_proxy; - using vt::vrt::collection::balance::ProposedReassignment; std::shared_ptr new_model = nullptr; // choose a set of migrations for the load model to represent @@ -156,45 +195,16 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_home) { PhaseType phase = 0; const size_t numElements = 5; - using vt::vrt::collection::balance::StatsData; auto sd = setupWorkloads(phase, numElements); - - auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); - - vt::runInEpochCollective("updateLoads", [&]{ - base_load_model->updateLoads(phase); - }); - - using vt::vrt::collection::balance::WorkloadDataMigrator; - using vt::vrt::collection::balance::ProposedReassignment; - using vt::vrt::collection::balance::LBType; - using ObjIDType = vt::elm::ElementIDStruct; - std::shared_ptr not_home_model = nullptr; + auto base_load_model = setupBaseModel(phase, sd); // move everything off the home node - vt::runInEpochCollective("do shift", [&]{ - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); - if (lb_reassignment != nullptr) { - fmt::print( - "{}: global_mig={}, depart={}, arrive={}\n", - lb_reassignment->node_, - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - not_home_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - } - }); - runInEpochCollective("destroy lb", [&]{ - vt::theLBManager()->destroyLB(); - }); + std::shared_ptr not_home_model = shiftObjectsRight( + base_load_model, phase + ); // list nothing as here so that we skip the optimization + using ObjIDType = vt::elm::ElementIDStruct; std::set no_migratable_objects_here; vt::objgroup::proxy::Proxy norm_lb_proxy; @@ -236,53 +246,24 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { PhaseType phase = 0; const size_t numElements = 5; - using vt::vrt::collection::balance::StatsData; auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); - auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); - - vt::runInEpochCollective("updateLoads", [&]{ - base_load_model->updateLoads(phase); - }); - - using vt::vrt::collection::balance::WorkloadDataMigrator; - using vt::vrt::collection::balance::ProposedReassignment; - using vt::vrt::collection::balance::LBType; + // move everything off the home node + std::shared_ptr not_home_model = shiftObjectsRight( + base_load_model, phase + ); using ObjIDType = vt::elm::ElementIDStruct; std::set migratable_objects_here; - std::shared_ptr not_home_model = nullptr; - - // move everything off the home node - vt::runInEpochCollective("do shift", [&]{ - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); - if (lb_reassignment != nullptr) { - fmt::print( - "{}: global_mig={}, depart={}, arrive={}\n", - lb_reassignment->node_, - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - not_home_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - for (auto it = not_home_model->begin(); it.isValid(); ++it) { - if ((*it).isMigratable()) { - // only claim a subset of them are here (relates to an optimization in - // the code being tested) - if ((*it).id % 3 == 0) { - migratable_objects_here.insert(*it); - } - } + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + // only claim a subset of them are here (relates to an optimization in + // the code being tested) + if ((*it).id % 3 == 0) { + migratable_objects_here.insert(*it); } } - }); - runInEpochCollective("destroy lb", [&]{ - vt::theLBManager()->destroyLB(); - }); + } vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr back_home_if_not_here_model = nullptr; @@ -329,49 +310,20 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { PhaseType phase = 0; const size_t numElements = 5; - using vt::vrt::collection::balance::StatsData; auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); - auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); - - vt::runInEpochCollective("updateLoads", [&]{ - base_load_model->updateLoads(phase); - }); - - using vt::vrt::collection::balance::WorkloadDataMigrator; - using vt::vrt::collection::balance::ProposedReassignment; - using vt::vrt::collection::balance::LBType; + // move everything off the home node + std::shared_ptr not_home_model = shiftObjectsRight( + base_load_model, phase + ); using ObjIDType = vt::elm::ElementIDStruct; std::set migratable_objects_here; - std::shared_ptr not_home_model = nullptr; - - // move everything off the home node - vt::runInEpochCollective("do shift", [&]{ - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); - if (lb_reassignment != nullptr) { - fmt::print( - "{}: global_mig={}, depart={}, arrive={}\n", - lb_reassignment->node_, - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - not_home_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - for (auto it = not_home_model->begin(); it.isValid(); ++it) { - if ((*it).isMigratable()) { - migratable_objects_here.insert(*it); - } - } + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); } - }); - runInEpochCollective("destroy lb", [&]{ - vt::theLBManager()->destroyLB(); - }); + } vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr here_model = nullptr; @@ -413,53 +365,24 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { PhaseType phase = 0; const size_t numElements = 5; - using vt::vrt::collection::balance::StatsData; auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); - auto base_load_model = vt::theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); - - vt::runInEpochCollective("updateLoads", [&]{ - base_load_model->updateLoads(phase); - }); - - using vt::vrt::collection::balance::WorkloadDataMigrator; - using vt::vrt::collection::balance::ProposedReassignment; - using vt::vrt::collection::balance::LBType; + // move everything off the home node + std::shared_ptr not_home_model = shiftObjectsRight( + base_load_model, phase + ); using ObjIDType = vt::elm::ElementIDStruct; std::set migratable_objects_here; - std::shared_ptr not_home_model = nullptr; - - // move everything off the home node - vt::runInEpochCollective("do shift", [&]{ - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); - if (lb_reassignment != nullptr) { - fmt::print( - "{}: global_mig={}, depart={}, arrive={}\n", - lb_reassignment->node_, - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - not_home_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - for (auto it = not_home_model->begin(); it.isValid(); ++it) { - if ((*it).isMigratable()) { - // only claim a subset of them are here (relates to an optimization in - // the code being tested) - if ((*it).id % 3 == 0) { - migratable_objects_here.insert(*it); - } - } + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + // only claim a subset of them are here (relates to an optimization in + // the code being tested) + if ((*it).id % 3 == 0) { + migratable_objects_here.insert(*it); } } - }); - runInEpochCollective("destroy lb", [&]{ - vt::theLBManager()->destroyLB(); - }); + } vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr here_model = nullptr; From bfc3a0ccbeaf0db359a57c85870ea98439d78d9b Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 13:25:13 -0700 Subject: [PATCH 15/41] #1265: replay: refactor to improve readability --- .../vrt/collection/balance/workload_replay.cc | 101 ++++++++++++------ .../vrt/collection/balance/workload_replay.h | 18 ++++ 2 files changed, 86 insertions(+), 33 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index d1fd12f7b3..db43522366 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -130,41 +130,15 @@ void replayWorkloads( ); } - // at the beginning of this phase, objects will exist in the locations - // they were placed by the previous lb invocation; this will be the - // arriving node for the purposes of this load model; that location - // is known by both the rank at which the lb placed the object and the - // rank from which the lb removed the object; the curr_node member of - // the object ids in the lb_reassignment object refers to the pre-lb - // location on the previous phase, but the curr_node member for our new - // load model must point to where the workloads data exists for this phase - - // the workloads data for this phase can exist at arbitrary locations; the - // only rank to know the location of this data is the one that has it; - // this will be the departing node for the purposes of this load model; - // we need to make sure the curr_node member of the object ids in our - // new load model points to the node on which the workloads data lives - - runInEpochCollective("WorkloadReplayDriver -> migrateStatsDataHome", [&] { - auto norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); - auto normalizer = norm_lb_proxy.get(); - pre_lb_load_model = normalizer->createModelToMoveWorkloadsHome( - base_load_model, migratable_objects_here - ); - norm_lb_proxy.destroyCollective(); - }); - theLBManager()->setLoadModel(pre_lb_load_model); - pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); + // get the workloads to the ranks where the objects currently exist + pre_lb_load_model = WorkloadDataMigrator::relocateWorkloadsForReplay( + base_load_model, migratable_objects_here + ); - runInEpochCollective("WorkloadReplayDriver -> migrateStatsDataHere", [&] { - auto norm_lb_proxy = WorkloadDataMigrator::construct(pre_lb_load_model); - auto normalizer = norm_lb_proxy.get(); - pre_lb_load_model = normalizer->createModelToMoveWorkloadsHere( - pre_lb_load_model, migratable_objects_here - ); - norm_lb_proxy.destroyCollective(); - }); + // update the load model that will be used by the real load balancer theLBManager()->setLoadModel(pre_lb_load_model); + + // force it to use our json workloads, not anything it may have collected pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); } @@ -308,6 +282,67 @@ WorkloadDataMigrator::readInWorkloads(std::string filename) { return sd; } +/*static*/ +std::shared_ptr +WorkloadDataMigrator::relocateWorkloadsForReplay( + std::shared_ptr model_base, + std::set migratable_objects_here +) { + // Object workloads may exist on arbitrary ranks instead of being colocated + // with the objects themselves. Relocate the workloads to where the objects + // themselves exist. Do this by first migrating home all workloads that are + // neither at home nor colocated with the object. Finally, migrate from home + // all workloads not already colocated with the object. + + std::shared_ptr move_home_model = + relocateMisplacedWorkloadsHome(model_base, migratable_objects_here); + + std::shared_ptr move_here_model = + relocateMisplacedWorkloadsHere(move_home_model, migratable_objects_here); + + return move_here_model; +} + +/*static*/ +std::shared_ptr +WorkloadDataMigrator::relocateMisplacedWorkloadsHome( + std::shared_ptr model_base, + std::set migratable_objects_here +) { + std::shared_ptr move_home_model = nullptr; + + runInEpochCollective("WorkloadDataMigrator -> migrateStatsDataHome", [&] { + auto norm_lb_proxy = WorkloadDataMigrator::construct(model_base); + auto normalizer = norm_lb_proxy.get(); + move_home_model = normalizer->createModelToMoveWorkloadsHome( + model_base, migratable_objects_here + ); + norm_lb_proxy.destroyCollective(); + }); + + return move_home_model; +} + +/*static*/ +std::shared_ptr +WorkloadDataMigrator::relocateMisplacedWorkloadsHere( + std::shared_ptr model_base, + std::set migratable_objects_here +) { + std::shared_ptr move_here_model = nullptr; + + runInEpochCollective("WorkloadDataMigrator -> migrateStatsDataHere", [&] { + auto norm_lb_proxy = WorkloadDataMigrator::construct(model_base); + auto normalizer = norm_lb_proxy.get(); + move_here_model = normalizer->createModelToMoveWorkloadsHere( + model_base, migratable_objects_here + ); + norm_lb_proxy.destroyCollective(); + }); + + return move_here_model; +} + std::shared_ptr WorkloadDataMigrator::createModelToMoveWorkloadsHome( std::shared_ptr model_base, diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index cb239d5e05..dbfee50789 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -88,6 +88,24 @@ struct WorkloadDataMigrator : lb::BaseLB { static std::shared_ptr readInWorkloads(std::string filename); + static std::shared_ptr + relocateWorkloadsForReplay( + std::shared_ptr model_base, + std::set migratable_objects_here + ); + + static std::shared_ptr + relocateMisplacedWorkloadsHome( + std::shared_ptr model_base, + std::set migratable_objects_here + ); + + static std::shared_ptr + relocateMisplacedWorkloadsHere( + std::shared_ptr model_base, + std::set migratable_objects_here + ); + std::shared_ptr createModelToMoveWorkloadsHome( std::shared_ptr model_base, From 31cea6131ebb3d46ed6b74ffcf14377fa11d88b4 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 13:59:59 -0700 Subject: [PATCH 16/41] #1265: tests: leverage refactor in testing --- .../collection/test_workload_data_migrator.cc | 62 +++---------------- 1 file changed, 10 insertions(+), 52 deletions(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 9f4103f416..0fab8d6719 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -207,21 +207,11 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_home) { using ObjIDType = vt::elm::ElementIDStruct; std::set no_migratable_objects_here; - vt::objgroup::proxy::Proxy norm_lb_proxy; - std::shared_ptr back_home_model = nullptr; - // then create a load model that restores them to homes - vt::runInEpochCollective("migrate workloads home", [&]{ - norm_lb_proxy = WorkloadDataMigrator::construct(not_home_model); - auto normalizer = norm_lb_proxy.get(); - - back_home_model = normalizer->createModelToMoveWorkloadsHome( + std::shared_ptr back_home_model = + WorkloadDataMigrator::relocateMisplacedWorkloadsHome( not_home_model, no_migratable_objects_here ); - }); - runInEpochCollective("destroy migrator", [&]{ - norm_lb_proxy.destroyCollective(); - }); // then iterate over it to make sure what shows up here is correct for (auto obj_id : *back_home_model) { @@ -265,21 +255,11 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { } } - vt::objgroup::proxy::Proxy norm_lb_proxy; - std::shared_ptr back_home_if_not_here_model = nullptr; - // then create a load model that restores them to homes - vt::runInEpochCollective("migrate workloads home", [&]{ - norm_lb_proxy = WorkloadDataMigrator::construct(not_home_model); - auto normalizer = norm_lb_proxy.get(); - - back_home_if_not_here_model = normalizer->createModelToMoveWorkloadsHome( + std::shared_ptr back_home_if_not_here_model = + WorkloadDataMigrator::relocateMisplacedWorkloadsHome( not_home_model, migratable_objects_here ); - }); - runInEpochCollective("destroy migrator", [&]{ - norm_lb_proxy.destroyCollective(); - }); // then iterate over it to make sure what shows up here is correct for (auto obj_id : *back_home_if_not_here_model) { @@ -325,22 +305,11 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { } } - vt::objgroup::proxy::Proxy norm_lb_proxy; - std::shared_ptr here_model = nullptr; - - // then create a load model that pulls loads here from home, - // based on the base load model, not the one we just created - vt::runInEpochCollective("migrate workloads here", [&]{ - norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); - auto normalizer = norm_lb_proxy.get(); - - here_model = normalizer->createModelToMoveWorkloadsHere( + // then create a load model that restores them to homes + std::shared_ptr here_model = + WorkloadDataMigrator::relocateMisplacedWorkloadsHere( base_load_model, migratable_objects_here ); - }); - runInEpochCollective("destroy migrator", [&]{ - norm_lb_proxy.destroyCollective(); - }); // then iterate over it to make sure what shows up here is correct for (auto obj_id : *here_model) { @@ -384,22 +353,11 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { } } - vt::objgroup::proxy::Proxy norm_lb_proxy; - std::shared_ptr here_model = nullptr; - - // then create a load model that pulls loads here from home, - // based on the base load model, not the one we just created - vt::runInEpochCollective("migrate workloads here", [&]{ - norm_lb_proxy = WorkloadDataMigrator::construct(base_load_model); - auto normalizer = norm_lb_proxy.get(); - - here_model = normalizer->createModelToMoveWorkloadsHere( + // then create a load model that restores them to homes + std::shared_ptr here_model = + WorkloadDataMigrator::relocateMisplacedWorkloadsHere( base_load_model, migratable_objects_here ); - }); - runInEpochCollective("destroy migrator", [&]{ - norm_lb_proxy.destroyCollective(); - }); // then iterate over it to make sure what shows up here is correct for (auto obj_id : *here_model) { From 35f05dabf17df6f1b3c5258e7655c7144476d88f Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 14:49:09 -0700 Subject: [PATCH 17/41] #1265: replay: clean up code --- .../vrt/collection/balance/workload_replay.cc | 120 +++++++++--------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index db43522366..81e38633e9 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -60,32 +60,30 @@ void replayWorkloads( ) { using ObjIDType = elm::ElementIDStruct; + auto const this_rank = theContext()->getNode(); + // read in object loads from json files auto const filename = theConfig()->getLBStatsFileIn(); auto sd = WorkloadDataMigrator::readInWorkloads(filename); // remember vt's base load model auto base_load_model = theLBManager()->getBaseLoadModel(); - - // allow remembering the migrations suggested by the load balancer - std::shared_ptr lb_reassignment = nullptr; - - // allow remembering what objects are here after the load balancer migrates - std::set migratable_objects_here; // force it to use our json workloads, not anything it may have collected base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); // point the load model at the workloads for the relevant phase runInEpochCollective("WorkloadReplayDriver -> updateLoads", [=] { base_load_model->updateLoads(initial_phase); }); - for (auto stat_obj_id : *base_load_model) { - if (stat_obj_id.isMigratable()) { - migratable_objects_here.insert(stat_obj_id); + + // allow remembering what objects are here after the load balancer migrates + std::set migratable_objects_here; + for (auto workload_id : *base_load_model) { + if (workload_id.isMigratable()) { + migratable_objects_here.insert(workload_id); } } // simulate the requested number of phases - auto const this_rank = theContext()->getNode(); auto stop_phase = initial_phase + phases_to_run; for (PhaseType phase = initial_phase; phase < stop_phase; phase++) { // reapply the base load model if in case we overwrote it on a previous iter @@ -99,34 +97,32 @@ void replayWorkloads( base_load_model->updateLoads(phase); }); - size_t count = 0; - for (auto stat_obj_id : *base_load_model) { - if (stat_obj_id.isMigratable()) { - ++count; - vt_debug_print( - normal, replay, - "workloads for id {} are here on phase {}\n", - stat_obj_id, phase - ); + if (theConfig()->vt_debug_replay) { + size_t count = 0; + for (auto workload_id : *base_load_model) { + if (workload_id.isMigratable()) { + ++count; + vt_debug_print( + normal, replay, + "workload for element {} is here on phase {}\n", workload_id, phase + ); + } } + vt_debug_print( + terse, replay, + "Number of known workloads: {}\n", count + ); } - // sanity output - vt_debug_print( - terse, replay, - "Stats num objects: {}\n", count - ); auto pre_lb_load_model = base_load_model; - // if this isn't the initial phase, then the workloads may exist on a rank + // if this isn't the initial phase, then the workload may exist on a rank // other than where the objects are currently meant to exist; we will // use a Reassignment object to get those workloads where they need to be if (phase > initial_phase) { if (this_rank == 0) { vt_print( - replay, - "Migrating imported object workloads to phase {} ranks...\n", - phase + replay, "Migrating object workloads to phase {} ranks...\n", phase ); } @@ -146,22 +142,22 @@ void replayWorkloads( vt_print(replay, "Simulating phase {}...\n", phase); } - // sanity output - count = 0; - for (auto stat_obj_id : *pre_lb_load_model) { - if (stat_obj_id.isMigratable()) { - ++count; - vt_debug_print( - normal, replay, - "element {} is here on phase {} pre-lb\n", - stat_obj_id, phase - ); + if (theConfig()->vt_debug_replay) { + size_t count = 0; + for (auto workload_id : *pre_lb_load_model) { + if (workload_id.isMigratable()) { + ++count; + vt_debug_print( + normal, replay, + "element {} is here on phase {} before LB\n", workload_id, phase + ); + } } + vt_debug_print( + terse, replay, + "Number of objects before LB: {}\n", count + ); } - vt_debug_print( - terse, replay, - "Pre-lb num objects: {}\n", count - ); vt_debug_print( terse, replay, @@ -171,29 +167,27 @@ void replayWorkloads( runInEpochCollective("WorkloadReplayDriver -> runRealLB", [&] { // run the load balancer but don't let it automatically migrate; // instead, remember where the LB wanted to migrate objects - lb_reassignment = theLBManager()->selectStartLB(phase); + auto lb_reassignment = theLBManager()->selectStartLB(phase); if (lb_reassignment) { auto proposed_model = std::make_shared( - pre_lb_load_model, lb_reassignment + pre_lb_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) ); migratable_objects_here.clear(); for (auto it = proposed_model->begin(); it.isValid(); ++it) { if ((*it).isMigratable()) { - ObjIDType loc_id = *it; - loc_id.curr_node = this_rank; - migratable_objects_here.insert(loc_id); + migratable_objects_here.insert(*it); vt_debug_print( normal, replay, - "element {} is here on phase {} post-lb\n", - loc_id, phase + "element {} is here on phase {} after LB\n", *it, phase ); } } } vt_debug_print( terse, replay, - "Post-lb num objects: {}\n", migratable_objects_here.size() + "Number of objects after LB: {}\n", migratable_objects_here.size() ); }); runInEpochCollective("WorkloadReplayDriver -> destroyLB", [&] { @@ -355,19 +349,19 @@ WorkloadDataMigrator::createModelToMoveWorkloadsHome( ); runInEpochCollective("WorkloadDataMigrator -> transferStatsHome", [&] { - for (auto stat_obj_id : *model_base) { - if (stat_obj_id.isMigratable()) { + for (auto workload_id : *model_base) { + if (workload_id.isMigratable()) { // if the object belongs here, do nothing; otherwise, "transfer" it to // the home rank so that it can later be sent to the rank holding the // object - if (stat_obj_id.getHomeNode() != this_rank) { - if (migratable_objects_here.count(stat_obj_id) == 0) { + if (workload_id.getHomeNode() != this_rank) { + if (migratable_objects_here.count(workload_id) == 0) { vt_debug_print( verbose, replay, "will transfer load of {} home to {}\n", - stat_obj_id, stat_obj_id.getHomeNode() + workload_id, workload_id.getHomeNode() ); - migrateObjectTo(stat_obj_id, stat_obj_id.getHomeNode()); + migrateObjectTo(workload_id, workload_id.getHomeNode()); } } } @@ -391,28 +385,28 @@ WorkloadDataMigrator::createModelToMoveWorkloadsHere( ); runInEpochCollective("WorkloadDataMigrator -> transferStatsHere", [&] { - for (auto stat_obj_id : migratable_objects_here) { + for (auto workload_id : migratable_objects_here) { // if the object is already here, do nothing; otherwise, "transfer" it - // from the home rank so that we will have the needed workloads data + // from the home rank so that we will have the needed workload data bool workloads_here = false; for (auto other_id : *model_base) { - if (stat_obj_id == other_id) { + if (workload_id == other_id) { workloads_here = true; break; } } if (!workloads_here) { // check that this isn't something that should already have been here - assert(stat_obj_id.getHomeNode() != this_rank); + assert(workload_id.getHomeNode() != this_rank); vt_debug_print( verbose, replay, "will transfer load of {} from home {}\n", - stat_obj_id, stat_obj_id.getHomeNode() + workload_id, workload_id.getHomeNode() ); - ObjIDType mod_id = stat_obj_id; + ObjIDType mod_id = workload_id; // Override curr_node to force retrieval from the home rank - mod_id.curr_node = stat_obj_id.getHomeNode(); + mod_id.curr_node = workload_id.getHomeNode(); migrateObjectTo(mod_id, this_rank); } } From b693bbefdb6e9cd8ba55007e6ea6d38eb9a93098 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 23 Mar 2022 15:07:15 -0700 Subject: [PATCH 18/41] #1265: tests: add more replay tests --- .../collection/test_workload_data_migrator.cc | 126 +++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 0fab8d6719..5682164750 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -136,6 +136,38 @@ shiftObjectsRight( return new_model; } +std::shared_ptr +shiftObjectsRandomly( + std::shared_ptr base_load_model, + vt::PhaseType phase +) { + std::shared_ptr new_model = nullptr; + + vt::runInEpochCollective("do shift", [&]{ + using vt::vrt::collection::balance::LBType; + auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RandomLB); + if (lb_reassignment != nullptr) { + vt_debug_print( + normal, replay, + "global_mig={}, depart={}, arrive={}\n", + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + new_model = std::make_shared( + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) + ); + } + }); + + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + return new_model; +} + TEST_F(TestWorkloadDataMigrator, test_normalize_call) { auto const& this_node = vt::theContext()->getNode(); @@ -353,7 +385,7 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { } } - // then create a load model that restores them to homes + // then create a load model that brings them here std::shared_ptr here_model = WorkloadDataMigrator::relocateMisplacedWorkloadsHere( base_load_model, migratable_objects_here @@ -382,6 +414,98 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { } } +TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_1) { + auto const& this_node = vt::theContext()->getNode(); + + PhaseType phase = 0; + const size_t numElements = 5; + + auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); + + // shift the workloads to not be home + std::shared_ptr workloads_not_home_model = + shiftObjectsRight(base_load_model, phase); + + // put the objects whereever + std::shared_ptr objects_whereever_model = + shiftObjectsRandomly(base_load_model, phase); + using ObjIDType = vt::elm::ElementIDStruct; + std::set migratable_objects_here; + for (auto it = objects_whereever_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); + } + } + + // then create a load model that matches everything up + std::shared_ptr here_model = + WorkloadDataMigrator::relocateWorkloadsForReplay( + workloads_not_home_model, migratable_objects_here + ); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *here_model) { + if (obj_id.isMigratable()) { + EXPECT_EQ(migratable_objects_here.count(obj_id), 1); + + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + +TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { + auto const& this_node = vt::theContext()->getNode(); + + PhaseType phase = 0; + const size_t numElements = 5; + + auto sd = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, sd); + + // put the workloads whereever + std::shared_ptr workloads_whereever_model = + shiftObjectsRandomly(base_load_model, phase); + + // shift the objects so they aren't at home + std::shared_ptr objects_not_home_model = + shiftObjectsRight(base_load_model, phase); + using ObjIDType = vt::elm::ElementIDStruct; + std::set migratable_objects_here; + for (auto it = objects_not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); + } + } + + // then create a load model that matches everything up + std::shared_ptr here_model = + WorkloadDataMigrator::relocateWorkloadsForReplay( + workloads_whereever_model, migratable_objects_here + ); + + // then iterate over it to make sure what shows up here is correct + for (auto obj_id : *here_model) { + if (obj_id.isMigratable()) { + EXPECT_EQ(migratable_objects_here.count(obj_id), 1); + + EXPECT_EQ(obj_id.getCurrNode(), this_node); + + using vt::vrt::collection::balance::PhaseOffset; + auto load = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + EXPECT_EQ(load, obj_id.id * 2); + } + } +} + }}}} // end namespace vt::tests::unit::reassignment #endif /*vt_check_enabled(lblite)*/ From 3f07a727f929f39f77c5cdcc3ad9609a8b31110e Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 11:45:15 -0700 Subject: [PATCH 19/41] #1265: replay: update license headers --- scripts/check_license.sh | 2 +- src/vt/vrt/collection/balance/workload_replay.cc | 5 ++--- src/vt/vrt/collection/balance/workload_replay.h | 5 ++--- tests/unit/collection/test_workload_data_migrator.cc | 2 +- tools/workload_replay/simulate_replay.cc | 5 ++--- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/scripts/check_license.sh b/scripts/check_license.sh index 2c573ad9c0..8ab58bad59 100755 --- a/scripts/check_license.sh +++ b/scripts/check_license.sh @@ -3,7 +3,7 @@ path_to_vt=${1} cd "$path_to_vt" || exit 1 -for sub_dir in "src" "tests/unit" "tests/perf" "tutorial" "examples" +for sub_dir in "src" "tests/unit" "tests/perf" "tutorial" "examples" "tools" do "$path_to_vt/scripts/add-license-perl.pl" "$path_to_vt/$sub_dir" "$path_to_vt/scripts/license-template" done diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 81e38633e9..86dc96e7ec 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -2,11 +2,10 @@ //@HEADER // ***************************************************************************** // -// workload_replay.cc -// DARMA Toolkit v. 1.0.0 +// workload_replay.cc // DARMA/vt => Virtual Transport // -// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC // (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. // Government retains certain rights in this software. // diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index dbfee50789..62ea6ef64c 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -2,11 +2,10 @@ //@HEADER // ***************************************************************************** // -// workload_replay.h -// DARMA Toolkit v. 1.0.0 +// workload_replay.h // DARMA/vt => Virtual Transport // -// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC // (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. // Government retains certain rights in this software. // diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 5682164750..5c60eaf83d 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// test_workload_data_migrator.cc +// test_workload_data_migrator.cc // DARMA/vt => Virtual Transport // // Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC diff --git a/tools/workload_replay/simulate_replay.cc b/tools/workload_replay/simulate_replay.cc index f9ab1b6c73..51ab3ff1ef 100644 --- a/tools/workload_replay/simulate_replay.cc +++ b/tools/workload_replay/simulate_replay.cc @@ -2,11 +2,10 @@ //@HEADER // ***************************************************************************** // -// simulate_replay.cc -// DARMA Toolkit v. 1.0.0 +// simulate_replay.cc // DARMA/vt => Virtual Transport // -// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC // (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. // Government retains certain rights in this software. // From 39112db44efe421e35473827a95f9607b455ea20 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 12:47:01 -0700 Subject: [PATCH 20/41] #1265: replay: add doxygen --- .../vrt/collection/balance/workload_replay.h | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 62ea6ef64c..b953a289b0 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -58,16 +58,45 @@ namespace vt { namespace vrt { namespace collection { namespace balance { +/** + * \brief Simulate replaying the object workloads as recorded in the json file, + * but allow new load balancing decisions to be made. + * + * \param[in] initial_phase the first phase to replay + * \param[in] phases_to_run how many phases to replay + * + * The json files specified by the command-line arguments --vt_lb_stats_file_in + * and --vt_lb_stats_dir_in will be imported and the LB data contained within + * will be fed through the specified load balancer(s) on each requested phase, + * allowing new load balancing migrations to happen. There is no requirement to + * colocate the LB data on the same rank as the object exists during any given + * phase. + */ void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run ); +/** + * \struct WorkloadDataMigrator + * + * \brief A helper objgroup for workload replay. Derives from + * \c vt::Vrt::collection::lb::BaseLB in order to gain access to + * normalizeReassignments but is not a load balancer in the traditional sense. + * A new instance should be created for each call to normalizeReassignments. + */ struct WorkloadDataMigrator : lb::BaseLB { using ObjIDType = elm::ElementIDStruct; WorkloadDataMigrator() = default; + /** + * \brief Construct an objgroup and configure it + * + * \param[in] model_base the load model that reflects the known workloads + * + * \return the objgroup proxy to use for exchanging workload information + */ static objgroup::proxy::Proxy construct(std::shared_ptr model_base); @@ -79,38 +108,95 @@ struct WorkloadDataMigrator : lb::BaseLB { using BaseLB::normalizeReassignments; + /** + * \brief Update the current locations of objects so that ProposedReassignment + * load models can be composed + * + * \param[in] lb_reassignment the Reassignment returned by a load balancer + * + * \return a new Reassignment that reflects the updated locations of objects + */ static std::shared_ptr updateCurrentNodes( std::shared_ptr lb_reassignment ); + /** + * \brief Build a StatsData object from the LB data in a json file + * + * \param[in] filename read in LB data from the specified json file + * + * \return the StatsData object built from the LB data + */ static std::shared_ptr readInWorkloads(std::string filename); + /** + * \brief Relocate object workloads to the rank where the objects are supposed + * to exist during this phase + * + * \param[in] model_base the load model for the phase we are simulating + * \param[in] migratable_objects_here migratable objects here on this phase + * + * \return load model that makes the necessary object workloads available + */ static std::shared_ptr relocateWorkloadsForReplay( std::shared_ptr model_base, std::set migratable_objects_here ); + /** + * \brief Instantiate objgroup and relocate applicable object workloads home + * + * \param[in] model_base the load model for the phase we are simulating + * \param[in] migratable_objects_here migratable objects here on this phase + * + * \return load model that makes the necessary object workloads available + */ static std::shared_ptr relocateMisplacedWorkloadsHome( std::shared_ptr model_base, std::set migratable_objects_here ); + /** + * \brief Instantiate objgroup and relocate applicable workloads here + * + * \param[in] model_base the load model for the phase we are simulating + * \param[in] migratable_objects_here migratable objects here on this phase + * + * \return load model that makes the necessary object workloads available + */ static std::shared_ptr relocateMisplacedWorkloadsHere( std::shared_ptr model_base, std::set migratable_objects_here ); +private: + /** + * \brief Relocate object workloads home if the object is not on this rank + * + * \param[in] model_base the load model for the phase we are simulating + * \param[in] migratable_objects_here migratable objects here on this phase + * + * \return load model that makes the necessary object workloads available + */ std::shared_ptr createModelToMoveWorkloadsHome( std::shared_ptr model_base, std::set migratable_objects_here ); + /** + * \brief Relocate workloads here for objects on this rank + * + * \param[in] model_base the load model for the phase we are simulating + * \param[in] migratable_objects_here migratable objects here on this phase + * + * \return load model that makes the necessary object workloads available + */ std::shared_ptr createModelToMoveWorkloadsHere( std::shared_ptr model_base, From 9144b7b4685c0e3a2a0891db15b096efe35310f7 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 13:00:17 -0700 Subject: [PATCH 21/41] #1265: tests: add subphases to workload migrator test --- .../collection/test_workload_data_migrator.cc | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 5c60eaf83d..22ea03fec2 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -86,6 +86,9 @@ setupWorkloads(PhaseType phase, size_t numElements) { for (auto&& elmID : myElemList) { double tval = elmID.id * 2; sd->node_data_[phase][elmID].whole_phase_load = tval; + auto &subphase_loads = sd->node_data_[phase][elmID].subphase_loads; + subphase_loads.push_back(elmID.id % 2 ? tval : 0); + subphase_loads.push_back(elmID.id % 2 ? 0 : tval); } return sd; @@ -217,6 +220,14 @@ TEST_F(TestWorkloadDataMigrator, test_normalize_call) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = new_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = new_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -257,6 +268,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_home) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = back_home_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = back_home_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -311,6 +330,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = back_home_if_not_here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = back_home_if_not_here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -355,6 +382,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -410,6 +445,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -456,6 +499,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_1) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } @@ -502,6 +553,14 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); + auto subload0 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 0} + ); + EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); + auto subload1 = here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, 1} + ); + EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); } } } From 34e77ff8b745f412d2a85b9241944eaba834ba43 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 13:16:36 -0700 Subject: [PATCH 22/41] #1265: replay: allow in-memory testing --- .../vrt/collection/balance/workload_replay.cc | 33 ++++++++++++------- .../vrt/collection/balance/workload_replay.h | 26 ++++++++++++--- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 86dc96e7ec..68ffde9ec2 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -56,19 +56,26 @@ namespace balance { void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run +) { + // read in object loads from json files + auto const filename = theConfig()->getLBStatsFileIn(); + auto workloads = WorkloadDataMigrator::readInWorkloads(filename); + + replayWorkloads(initial_phase, phases_to_run, workloads); +} + +void replayWorkloads( + PhaseType initial_phase, PhaseType phases_to_run, + std::shared_ptr workloads ) { using ObjIDType = elm::ElementIDStruct; auto const this_rank = theContext()->getNode(); - // read in object loads from json files - auto const filename = theConfig()->getLBStatsFileIn(); - auto sd = WorkloadDataMigrator::readInWorkloads(filename); - // remember vt's base load model auto base_load_model = theLBManager()->getBaseLoadModel(); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); + // force it to use our given workloads, not anything it may have collected + base_load_model->setLoads(&(workloads->node_data_), &(workloads->node_comm_)); // point the load model at the workloads for the relevant phase runInEpochCollective("WorkloadReplayDriver -> updateLoads", [=] { base_load_model->updateLoads(initial_phase); @@ -82,14 +89,16 @@ void replayWorkloads( } } - // simulate the requested number of phases + // simulate the given number of phases auto stop_phase = initial_phase + phases_to_run; for (PhaseType phase = initial_phase; phase < stop_phase; phase++) { // reapply the base load model if in case we overwrote it on a previous iter theLBManager()->setLoadModel(base_load_model); - // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); + // force it to use our given workloads, not anything it may have collected + base_load_model->setLoads( + &(workloads->node_data_), &(workloads->node_comm_) + ); // point the load model at the workloads for the relevant phase runInEpochCollective("WorkloadReplayDriver -> updateLoads", [=] { @@ -133,8 +142,10 @@ void replayWorkloads( // update the load model that will be used by the real load balancer theLBManager()->setLoadModel(pre_lb_load_model); - // force it to use our json workloads, not anything it may have collected - pre_lb_load_model->setLoads(&(sd->node_data_), &(sd->node_comm_)); + // force it to use our given workloads, not anything it may have collected + pre_lb_load_model->setLoads( + &(workloads->node_data_), &(workloads->node_comm_) + ); } if (this_rank == 0) { diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index b953a289b0..e9aa27073c 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -67,15 +67,33 @@ namespace balance { * * The json files specified by the command-line arguments --vt_lb_stats_file_in * and --vt_lb_stats_dir_in will be imported and the LB data contained within - * will be fed through the specified load balancer(s) on each requested phase, - * allowing new load balancing migrations to happen. There is no requirement to - * colocate the LB data on the same rank as the object exists during any given - * phase. + * will be fed through the load balancer(s) specified on the vt command-line + * on each requested phase, allowing new load balancing decisions to happen. + * There is no requirement to colocate the LB data on the same rank as the + * object exists during any given phase. */ void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run ); +/** + * \brief Simulate replaying the object workloads passed in, but allow new load + * balancing decisions to be made. + * + * \param[in] initial_phase the first phase to replay + * \param[in] phases_to_run how many phases to replay + * \param[in] workloads the workload data to simulate + * + * LB data passed in will be fed through the load balancer(s) specified on the + * vt command-line on each requested phase, allowing new load balancing + * decisions to happen. There is no requirement to colocate the LB data on the + * same rank as the object exists during any given phase. + */ +void replayWorkloads( + PhaseType initial_phase, PhaseType phases_to_run, + std::shared_ptr workloads +); + /** * \struct WorkloadDataMigrator * From cb7d7625959e193592b28012b0af2b7d2e6026a6 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 13:49:18 -0700 Subject: [PATCH 23/41] #1265: tests: run replay without verifying --- .../collection/test_workload_data_migrator.cc | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 22ea03fec2..1cde632821 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -565,6 +565,99 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { } } +std::shared_ptr +setupManyWorkloads( + PhaseType initial_phase, PhaseType num_phases, size_t numElements +) { + auto const& this_node = vt::theContext()->getNode(); + + using vt::vrt::collection::balance::ElementIDStruct; + + std::vector myElemList(numElements); + + for (size_t ii = 0; ii < numElements; ++ii) { + myElemList[ii] = elm::ElmIDBits::createCollectionImpl( + true, ii+1, this_node, this_node + ); + } + + auto sd = std::make_shared(); + + PhaseType stop_phase = initial_phase + num_phases; + for (PhaseType phase = initial_phase; phase < stop_phase; ++phase) { + for (size_t ii = 0; ii < numElements; ++ii) { + auto elmID = myElemList[ii]; + double tval = this_node + (ii + 10) * 2; + sd->node_data_[phase][elmID].whole_phase_load = tval + phase; + auto &subphase_loads = sd->node_data_[phase][elmID].subphase_loads; + subphase_loads.push_back(elmID.id % 2 ? tval : phase); + subphase_loads.push_back(elmID.id % 2 ? phase : tval); + } + } + + auto scrambled_sd = std::make_shared(); + + for (PhaseType phase = initial_phase; phase < stop_phase; ++phase) { + auto base_load_model = setupBaseModel(phase, sd); + + std::shared_ptr not_home_model = + shiftObjectsRight(base_load_model, phase); + + std::set migratable_objects_here; + for (auto it = not_home_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); + } + } + + // then create a load model that matches everything up + std::shared_ptr here_model = + WorkloadDataMigrator::relocateWorkloadsForReplay( + not_home_model, migratable_objects_here + ); + + // then store them at their new locations + for (auto it = here_model->begin(); it.isValid(); ++it) { + auto obj_id = *it; + using vt::vrt::collection::balance::PhaseOffset; + scrambled_sd->node_data_[phase][obj_id].whole_phase_load = + here_model->getWork( + obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} + ); + scrambled_sd->node_data_[phase][*it].subphase_loads.push_back( + here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 0}) + ); + scrambled_sd->node_data_[phase][*it].subphase_loads.push_back( + here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 1}) + ); + } + } + + return scrambled_sd; +} + +struct TestWorkloadReplay : TestParallelHarness { +#if vt_check_enabled(lblite) + void addAdditionalArgs() override { + static char vt_lb[]{"--vt_lb"}; + static char vt_lb_name[]{"--vt_lb_name=RandomLB"}; + addArgs(vt_lb, vt_lb_name); + } +#endif +}; + +TEST_F(TestWorkloadReplay, test_run_replay_no_verify) { + PhaseType initial_phase = 1; + PhaseType num_phases = 3; + const size_t numElements = 5; + + // first set up the workloads to replay, moving them around by phase + auto sd = setupManyWorkloads(initial_phase, num_phases, numElements); + + // then replay them but allow the lb to place objects differently + replayWorkloads(initial_phase, num_phases, sd); +} + }}}} // end namespace vt::tests::unit::reassignment #endif /*vt_check_enabled(lblite)*/ From 3839d3000ef4d389dfc173ba1c6c3f58df57a0a1 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Fri, 25 Mar 2022 13:58:18 -0700 Subject: [PATCH 24/41] #1265: replay: clarify usage in tool --- tools/workload_replay/simulate_replay.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/workload_replay/simulate_replay.cc b/tools/workload_replay/simulate_replay.cc index 51ab3ff1ef..59de2c9f44 100644 --- a/tools/workload_replay/simulate_replay.cc +++ b/tools/workload_replay/simulate_replay.cc @@ -51,7 +51,9 @@ int main(int argc, char** argv) { vtAbortIf( argc != 3, - "Must have two arguments: " + "Must have two app-specific arguments: \n" + "The json workload files needs to be specified using\n" + "--vt_lb_stats_file_in and --vt_lb_stats_dir_in" ); // initial phase to simulate @@ -59,6 +61,8 @@ int main(int argc, char** argv) { // number of phases to simulate PhaseType phases_to_run = atoi(argv[2]); + // the workloads used will be those specified with the command-line arguments + // --vt_lb_stats_file_in and --vt_lb_stats_dir_in vt::vrt::collection::balance::replayWorkloads( initial_phase, phases_to_run ); From 3ec982e2fdcaa9056202f6478908707d80b30bd5 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 12 Apr 2022 14:02:07 -0700 Subject: [PATCH 25/41] #1265: replay: add replay namespace --- src/vt/vrt/collection/balance/workload_replay.cc | 4 ++-- src/vt/vrt/collection/balance/workload_replay.h | 4 ++-- tests/unit/collection/test_workload_data_migrator.cc | 10 ++++++---- tools/workload_replay/simulate_replay.cc | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 68ffde9ec2..2c22a99bd6 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -52,7 +52,7 @@ #include namespace vt { namespace vrt { namespace collection { -namespace balance { +namespace balance { namespace replay { void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run @@ -430,4 +430,4 @@ WorkloadDataMigrator::createModelToMoveWorkloadsHere( return std::make_shared(model_base, here_assignment); } -}}}} /* end namespace vt::vrt::collection::balance */ +}}}}} /* end namespace vt::vrt::collection::balance::replay */ diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index e9aa27073c..2003856e62 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -56,7 +56,7 @@ #include namespace vt { namespace vrt { namespace collection { -namespace balance { +namespace balance { namespace replay { /** * \brief Simulate replaying the object workloads as recorded in the json file, @@ -222,6 +222,6 @@ struct WorkloadDataMigrator : lb::BaseLB { ); }; -}}}} /* end namespace vt::vrt::collection::balance */ +}}}}} /* end namespace vt::vrt::collection::balance::replay */ #endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_WORKLOAD_REPLAY_H*/ diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 1cde632821..39738fcf60 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -56,14 +56,14 @@ #if vt_check_enabled(lblite) -namespace vt { namespace tests { namespace unit { namespace reassignment { +namespace vt { namespace tests { namespace unit { namespace replay { using namespace vt::tests::unit; using vt::vrt::collection::balance::StatsData; using vt::vrt::collection::balance::LoadModel; using vt::vrt::collection::balance::ProposedReassignment; -using vt::vrt::collection::balance::WorkloadDataMigrator; +using vt::vrt::collection::balance::replay::WorkloadDataMigrator; struct TestWorkloadDataMigrator : TestParallelHarness { }; @@ -655,9 +655,11 @@ TEST_F(TestWorkloadReplay, test_run_replay_no_verify) { auto sd = setupManyWorkloads(initial_phase, num_phases, numElements); // then replay them but allow the lb to place objects differently - replayWorkloads(initial_phase, num_phases, sd); + vt::vrt::collection::balance::replay::replayWorkloads( + initial_phase, num_phases, sd + ); } -}}}} // end namespace vt::tests::unit::reassignment +}}}} // end namespace vt::tests::unit::replay #endif /*vt_check_enabled(lblite)*/ diff --git a/tools/workload_replay/simulate_replay.cc b/tools/workload_replay/simulate_replay.cc index 59de2c9f44..ec2803cbaa 100644 --- a/tools/workload_replay/simulate_replay.cc +++ b/tools/workload_replay/simulate_replay.cc @@ -63,7 +63,7 @@ int main(int argc, char** argv) { // the workloads used will be those specified with the command-line arguments // --vt_lb_stats_file_in and --vt_lb_stats_dir_in - vt::vrt::collection::balance::replayWorkloads( + vt::vrt::collection::balance::replay::replayWorkloads( initial_phase, phases_to_run ); From a74cfc983cd7128afe40e7f4866f4a696c22e954 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 12 Apr 2022 14:05:32 -0700 Subject: [PATCH 26/41] #1265: replay: clean up code --- .../vrt/collection/balance/workload_replay.cc | 57 +++++++++---------- .../vrt/collection/balance/workload_replay.h | 21 +++---- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 2c22a99bd6..b57198cd38 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -59,7 +59,7 @@ void replayWorkloads( ) { // read in object loads from json files auto const filename = theConfig()->getLBStatsFileIn(); - auto workloads = WorkloadDataMigrator::readInWorkloads(filename); + auto workloads = readInWorkloads(filename); replayWorkloads(initial_phase, phases_to_run, workloads); } @@ -207,6 +207,33 @@ void replayWorkloads( } } +std::shared_ptr +readInWorkloads(const std::string &filename) { + using util::json::Reader; + + Reader r{filename}; + auto json = r.readFile(); + auto sd = std::make_shared(*json); + + for (auto &phase_data : sd->node_data_) { + vt_debug_print( + normal, replay, + "found {} loads for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + + for (auto &phase_data : sd->node_comm_) { + vt_debug_print( + normal, replay, + "found {} comms for phase {}\n", + phase_data.second.size(), phase_data.first + ); + } + + return sd; +} + /*static*/ objgroup::proxy::Proxy @@ -258,34 +285,6 @@ WorkloadDataMigrator::updateCurrentNodes( return modified_reassignment; } -/*static*/ -std::shared_ptr -WorkloadDataMigrator::readInWorkloads(std::string filename) { - using util::json::Reader; - - Reader r{filename}; - auto json = r.readFile(); - auto sd = std::make_shared(*json); - - for (auto &phase_data : sd->node_data_) { - vt_debug_print( - normal, replay, - "found {} loads for phase {}\n", - phase_data.second.size(), phase_data.first - ); - } - - for (auto &phase_data : sd->node_comm_) { - vt_debug_print( - normal, replay, - "found {} comms for phase {}\n", - phase_data.second.size(), phase_data.first - ); - } - - return sd; -} - /*static*/ std::shared_ptr WorkloadDataMigrator::relocateWorkloadsForReplay( diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 2003856e62..548dd56219 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -94,6 +94,17 @@ void replayWorkloads( std::shared_ptr workloads ); +/** + * \brief Build a StatsData object from the LB data in a json file + * + * \param[in] filename read in LB data from the specified json file + * + * \return the StatsData object built from the LB data + */ +std::shared_ptr +readInWorkloads(const std::string &filename); + + /** * \struct WorkloadDataMigrator * @@ -139,16 +150,6 @@ struct WorkloadDataMigrator : lb::BaseLB { std::shared_ptr lb_reassignment ); - /** - * \brief Build a StatsData object from the LB data in a json file - * - * \param[in] filename read in LB data from the specified json file - * - * \return the StatsData object built from the LB data - */ - static std::shared_ptr - readInWorkloads(std::string filename); - /** * \brief Relocate object workloads to the rank where the objects are supposed * to exist during this phase From b24acee180e471055a556f59fca68676424a7236 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 13 Apr 2022 12:04:44 -0700 Subject: [PATCH 27/41] #1265: replay: update to use lb callbacks --- .../vrt/collection/balance/workload_replay.cc | 53 ++++++++----- .../collection/test_workload_data_migrator.cc | 79 +++++++++---------- 2 files changed, 70 insertions(+), 62 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index b57198cd38..08dca7ec06 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -177,28 +177,43 @@ void replayWorkloads( runInEpochCollective("WorkloadReplayDriver -> runRealLB", [&] { // run the load balancer but don't let it automatically migrate; // instead, remember where the LB wanted to migrate objects - auto lb_reassignment = theLBManager()->selectStartLB(phase); - if (lb_reassignment) { - auto proposed_model = std::make_shared( - pre_lb_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - migratable_objects_here.clear(); - for (auto it = proposed_model->begin(); it.isValid(); ++it) { - if ((*it).isMigratable()) { - migratable_objects_here.insert(*it); - vt_debug_print( - normal, replay, - "element {} is here on phase {} after LB\n", *it, phase - ); + std::shared_ptr proposed_model = nullptr; + auto postLBWork = [&](ReassignmentMsg *msg) { + auto lb_reassignment = msg->reassignment; + if (lb_reassignment) { + proposed_model = std::make_shared( + pre_lb_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) + ); + migratable_objects_here.clear(); + for (auto it = proposed_model->begin(); it.isValid(); ++it) { + if ((*it).isMigratable()) { + migratable_objects_here.insert(*it); + vt_debug_print( + normal, replay, + "element {} is here on phase {} after LB\n", *it, phase + ); + } } } - } - vt_debug_print( - terse, replay, - "Number of objects after LB: {}\n", migratable_objects_here.size() + vt_debug_print( + terse, replay, + "Number of objects after LB: {}\n", migratable_objects_here.size() + ); + runInEpochCollective("postLBWorkForReplay -> computeStats", [=] { + auto stats_cb = vt::theCB()->makeBcast< + LBManager, balance::NodeStatsMsg, &LBManager::statsHandler + >(theLBManager()->getProxy()); + theLBManager()->computeStatistics( + proposed_model, false, phase, stats_cb + ); + }); + }; + auto cb = theCB()->makeFunc( + vt::pipe::LifetimeEnum::Once, postLBWork ); + theLBManager()->selectStartLB(phase, cb); }); runInEpochCollective("WorkloadReplayDriver -> destroyLB", [&] { theLBManager()->destroyLB(); @@ -240,7 +255,7 @@ objgroup::proxy::Proxy WorkloadDataMigrator::construct(std::shared_ptr model_base) { auto my_proxy = theObjGroup()->makeCollective(); auto strat = my_proxy.get(); - auto base_proxy = my_proxy.template registerBaseCollective(); + auto base_proxy = my_proxy.template castToBase(); vt_debug_print( verbose, replay, "WorkloadDataMigrator proxy={} base_proxy={}\n", diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 39738fcf60..07c129158b 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -63,6 +63,7 @@ using namespace vt::tests::unit; using vt::vrt::collection::balance::StatsData; using vt::vrt::collection::balance::LoadModel; using vt::vrt::collection::balance::ProposedReassignment; +using vt::vrt::collection::balance::ReassignmentMsg; using vt::vrt::collection::balance::replay::WorkloadDataMigrator; struct TestWorkloadDataMigrator : TestParallelHarness { }; @@ -108,28 +109,34 @@ setupBaseModel(PhaseType phase, std::shared_ptr sd) { } std::shared_ptr -shiftObjectsRight( +migrateObjects( std::shared_ptr base_load_model, - vt::PhaseType phase + vt::PhaseType phase, + vt::vrt::collection::balance::LBType balancer ) { std::shared_ptr new_model = nullptr; - vt::runInEpochCollective("do shift", [&]{ - using vt::vrt::collection::balance::LBType; - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RotateLB); - if (lb_reassignment != nullptr) { - vt_debug_print( - normal, replay, - "global_mig={}, depart={}, arrive={}\n", - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - new_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - } + vt::runInEpochCollective("migrate", [&]{ + auto postLBWork = [&](ReassignmentMsg *msg) { + auto lb_reassignment = msg->reassignment; + if (lb_reassignment) { + vt_debug_print( + normal, replay, + "global_mig={}, depart={}, arrive={}\n", + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + new_model = std::make_shared( + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) + ); + } + }; + auto cb = theCB()->makeFunc( + vt::pipe::LifetimeEnum::Once, postLBWork + ); + theLBManager()->startLB(phase, balancer, cb); }); runInEpochCollective("destroy lb", [&]{ @@ -140,35 +147,21 @@ shiftObjectsRight( } std::shared_ptr -shiftObjectsRandomly( +shiftObjectsRight( std::shared_ptr base_load_model, vt::PhaseType phase ) { - std::shared_ptr new_model = nullptr; - - vt::runInEpochCollective("do shift", [&]{ - using vt::vrt::collection::balance::LBType; - auto lb_reassignment = vt::theLBManager()->startLB(phase, LBType::RandomLB); - if (lb_reassignment != nullptr) { - vt_debug_print( - normal, replay, - "global_mig={}, depart={}, arrive={}\n", - lb_reassignment->global_migration_count, - lb_reassignment->depart_.size(), - lb_reassignment->arrive_.size() - ); - new_model = std::make_shared( - base_load_model, - WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) - ); - } - }); - - runInEpochCollective("destroy lb", [&]{ - vt::theLBManager()->destroyLB(); - }); + using vt::vrt::collection::balance::LBType; + return migrateObjects(base_load_model, phase, LBType::RotateLB); +} - return new_model; +std::shared_ptr +shiftObjectsRandomly( + std::shared_ptr base_load_model, + vt::PhaseType phase +) { + using vt::vrt::collection::balance::LBType; + return migrateObjects(base_load_model, phase, LBType::RandomLB); } From d124d63a5733e2c28411d0da1fd1df3bc0d41edc Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 2 May 2022 09:57:30 -0700 Subject: [PATCH 28/41] #1265: replay: update to reflect stats renaming --- .../vrt/collection/balance/workload_replay.cc | 18 ++--- .../vrt/collection/balance/workload_replay.h | 14 ++-- .../collection/test_workload_data_migrator.cc | 70 +++++++++---------- tools/workload_replay/simulate_replay.cc | 4 +- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 08dca7ec06..9517708d2b 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -43,7 +43,7 @@ #include "vt/config.h" #include "vt/vrt/collection/balance/workload_replay.h" -#include "vt/vrt/collection/balance/stats_data.h" +#include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" #include "vt/utils/json/json_reader.h" @@ -58,7 +58,7 @@ void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run ) { // read in object loads from json files - auto const filename = theConfig()->getLBStatsFileIn(); + auto const filename = theConfig()->getLBDataFileIn(); auto workloads = readInWorkloads(filename); replayWorkloads(initial_phase, phases_to_run, workloads); @@ -66,7 +66,7 @@ void replayWorkloads( void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, - std::shared_ptr workloads + std::shared_ptr workloads ) { using ObjIDType = elm::ElementIDStruct; @@ -222,13 +222,13 @@ void replayWorkloads( } } -std::shared_ptr +std::shared_ptr readInWorkloads(const std::string &filename) { using util::json::Reader; Reader r{filename}; auto json = r.readFile(); - auto sd = std::make_shared(*json); + auto sd = std::make_shared(*json); for (auto &phase_data : sd->node_data_) { vt_debug_print( @@ -329,7 +329,7 @@ WorkloadDataMigrator::relocateMisplacedWorkloadsHome( ) { std::shared_ptr move_home_model = nullptr; - runInEpochCollective("WorkloadDataMigrator -> migrateStatsDataHome", [&] { + runInEpochCollective("WorkloadDataMigrator -> migrateLBDataHome", [&] { auto norm_lb_proxy = WorkloadDataMigrator::construct(model_base); auto normalizer = norm_lb_proxy.get(); move_home_model = normalizer->createModelToMoveWorkloadsHome( @@ -349,7 +349,7 @@ WorkloadDataMigrator::relocateMisplacedWorkloadsHere( ) { std::shared_ptr move_here_model = nullptr; - runInEpochCollective("WorkloadDataMigrator -> migrateStatsDataHere", [&] { + runInEpochCollective("WorkloadDataMigrator -> migrateLBDataHere", [&] { auto norm_lb_proxy = WorkloadDataMigrator::construct(model_base); auto normalizer = norm_lb_proxy.get(); move_here_model = normalizer->createModelToMoveWorkloadsHere( @@ -372,7 +372,7 @@ WorkloadDataMigrator::createModelToMoveWorkloadsHome( "constructing load model to get loads from file location to home\n" ); - runInEpochCollective("WorkloadDataMigrator -> transferStatsHome", [&] { + runInEpochCollective("WorkloadDataMigrator -> transferLBDataHome", [&] { for (auto workload_id : *model_base) { if (workload_id.isMigratable()) { // if the object belongs here, do nothing; otherwise, "transfer" it to @@ -408,7 +408,7 @@ WorkloadDataMigrator::createModelToMoveWorkloadsHere( "constructing load model to get loads from home to here\n" ); - runInEpochCollective("WorkloadDataMigrator -> transferStatsHere", [&] { + runInEpochCollective("WorkloadDataMigrator -> transferLBDataHere", [&] { for (auto workload_id : migratable_objects_here) { // if the object is already here, do nothing; otherwise, "transfer" it // from the home rank so that we will have the needed workload data diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 548dd56219..359173a507 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -46,7 +46,7 @@ #include "vt/config.h" #include "vt/elm/elm_id.h" -#include "vt/vrt/collection/balance/stats_data.h" +#include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/baselb/baselb.h" #include "vt/vrt/collection/balance/model/load_model.h" #include "vt/vrt/collection/balance/model/proposed_reassignment.h" @@ -65,8 +65,8 @@ namespace balance { namespace replay { * \param[in] initial_phase the first phase to replay * \param[in] phases_to_run how many phases to replay * - * The json files specified by the command-line arguments --vt_lb_stats_file_in - * and --vt_lb_stats_dir_in will be imported and the LB data contained within + * The json files specified by the command-line arguments --vt_lb_data_file_in + * and --vt_lb_data_dir_in will be imported and the LB data contained within * will be fed through the load balancer(s) specified on the vt command-line * on each requested phase, allowing new load balancing decisions to happen. * There is no requirement to colocate the LB data on the same rank as the @@ -91,17 +91,17 @@ void replayWorkloads( */ void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, - std::shared_ptr workloads + std::shared_ptr workloads ); /** - * \brief Build a StatsData object from the LB data in a json file + * \brief Build a LBDataHolder object from the LB data in a json file * * \param[in] filename read in LB data from the specified json file * - * \return the StatsData object built from the LB data + * \return the LBDataHolder object built from the LB data */ -std::shared_ptr +std::shared_ptr readInWorkloads(const std::string &filename); diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 07c129158b..03337a3783 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -49,7 +49,7 @@ #include "vt/elm/elm_id.h" #include "vt/elm/elm_id_bits.h" #include "vt/vrt/collection/balance/lb_common.h" -#include "vt/vrt/collection/balance/stats_data.h" +#include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" #include "vt/vrt/collection/balance/workload_replay.h" #include "vt/vrt/collection/balance/model/proposed_reassignment.h" @@ -60,7 +60,7 @@ namespace vt { namespace tests { namespace unit { namespace replay { using namespace vt::tests::unit; -using vt::vrt::collection::balance::StatsData; +using vt::vrt::collection::balance::LBDataHolder; using vt::vrt::collection::balance::LoadModel; using vt::vrt::collection::balance::ProposedReassignment; using vt::vrt::collection::balance::ReassignmentMsg; @@ -68,7 +68,7 @@ using vt::vrt::collection::balance::replay::WorkloadDataMigrator; struct TestWorkloadDataMigrator : TestParallelHarness { }; -std::shared_ptr +std::shared_ptr setupWorkloads(PhaseType phase, size_t numElements) { auto const& this_node = vt::theContext()->getNode(); @@ -82,24 +82,24 @@ setupWorkloads(PhaseType phase, size_t numElements) { ); } - auto sd = std::make_shared(); + auto lbdh = std::make_shared(); for (auto&& elmID : myElemList) { double tval = elmID.id * 2; - sd->node_data_[phase][elmID].whole_phase_load = tval; - auto &subphase_loads = sd->node_data_[phase][elmID].subphase_loads; + lbdh->node_data_[phase][elmID].whole_phase_load = tval; + auto &subphase_loads = lbdh->node_data_[phase][elmID].subphase_loads; subphase_loads.push_back(elmID.id % 2 ? tval : 0); subphase_loads.push_back(elmID.id % 2 ? 0 : tval); } - return sd; + return lbdh; } std::shared_ptr -setupBaseModel(PhaseType phase, std::shared_ptr sd) { +setupBaseModel(PhaseType phase, std::shared_ptr lbdh) { auto base_load_model = vt::theLBManager()->getBaseLoadModel(); // force it to use our json workloads, not anything it may have collected - base_load_model->setLoads(&sd->node_data_, &sd->node_comm_); + base_load_model->setLoads(&lbdh->node_data_, &lbdh->node_comm_); vt::runInEpochCollective("updateLoads", [&]{ base_load_model->updateLoads(phase); @@ -172,8 +172,8 @@ TEST_F(TestWorkloadDataMigrator, test_normalize_call) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); vt::objgroup::proxy::Proxy norm_lb_proxy; std::shared_ptr new_model = nullptr; @@ -231,8 +231,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_home) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // move everything off the home node std::shared_ptr not_home_model = shiftObjectsRight( @@ -280,8 +280,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // move everything off the home node std::shared_ptr not_home_model = shiftObjectsRight( @@ -342,8 +342,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // move everything off the home node std::shared_ptr not_home_model = shiftObjectsRight( @@ -394,8 +394,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // move everything off the home node std::shared_ptr not_home_model = shiftObjectsRight( @@ -456,8 +456,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_1) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // shift the workloads to not be home std::shared_ptr workloads_not_home_model = @@ -510,8 +510,8 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { PhaseType phase = 0; const size_t numElements = 5; - auto sd = setupWorkloads(phase, numElements); - auto base_load_model = setupBaseModel(phase, sd); + auto lbdh = setupWorkloads(phase, numElements); + auto base_load_model = setupBaseModel(phase, lbdh); // put the workloads whereever std::shared_ptr workloads_whereever_model = @@ -558,7 +558,7 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { } } -std::shared_ptr +std::shared_ptr setupManyWorkloads( PhaseType initial_phase, PhaseType num_phases, size_t numElements ) { @@ -574,24 +574,24 @@ setupManyWorkloads( ); } - auto sd = std::make_shared(); + auto lbdh = std::make_shared(); PhaseType stop_phase = initial_phase + num_phases; for (PhaseType phase = initial_phase; phase < stop_phase; ++phase) { for (size_t ii = 0; ii < numElements; ++ii) { auto elmID = myElemList[ii]; double tval = this_node + (ii + 10) * 2; - sd->node_data_[phase][elmID].whole_phase_load = tval + phase; - auto &subphase_loads = sd->node_data_[phase][elmID].subphase_loads; + lbdh->node_data_[phase][elmID].whole_phase_load = tval + phase; + auto &subphase_loads = lbdh->node_data_[phase][elmID].subphase_loads; subphase_loads.push_back(elmID.id % 2 ? tval : phase); subphase_loads.push_back(elmID.id % 2 ? phase : tval); } } - auto scrambled_sd = std::make_shared(); + auto scrambled_lbdh = std::make_shared(); for (PhaseType phase = initial_phase; phase < stop_phase; ++phase) { - auto base_load_model = setupBaseModel(phase, sd); + auto base_load_model = setupBaseModel(phase, lbdh); std::shared_ptr not_home_model = shiftObjectsRight(base_load_model, phase); @@ -613,20 +613,20 @@ setupManyWorkloads( for (auto it = here_model->begin(); it.isValid(); ++it) { auto obj_id = *it; using vt::vrt::collection::balance::PhaseOffset; - scrambled_sd->node_data_[phase][obj_id].whole_phase_load = + scrambled_lbdh->node_data_[phase][obj_id].whole_phase_load = here_model->getWork( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); - scrambled_sd->node_data_[phase][*it].subphase_loads.push_back( + scrambled_lbdh->node_data_[phase][*it].subphase_loads.push_back( here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 0}) ); - scrambled_sd->node_data_[phase][*it].subphase_loads.push_back( + scrambled_lbdh->node_data_[phase][*it].subphase_loads.push_back( here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 1}) ); } } - return scrambled_sd; + return scrambled_lbdh; } struct TestWorkloadReplay : TestParallelHarness { @@ -645,11 +645,11 @@ TEST_F(TestWorkloadReplay, test_run_replay_no_verify) { const size_t numElements = 5; // first set up the workloads to replay, moving them around by phase - auto sd = setupManyWorkloads(initial_phase, num_phases, numElements); + auto lbdh = setupManyWorkloads(initial_phase, num_phases, numElements); // then replay them but allow the lb to place objects differently vt::vrt::collection::balance::replay::replayWorkloads( - initial_phase, num_phases, sd + initial_phase, num_phases, lbdh ); } diff --git a/tools/workload_replay/simulate_replay.cc b/tools/workload_replay/simulate_replay.cc index ec2803cbaa..8a7045c655 100644 --- a/tools/workload_replay/simulate_replay.cc +++ b/tools/workload_replay/simulate_replay.cc @@ -53,7 +53,7 @@ int main(int argc, char** argv) { argc != 3, "Must have two app-specific arguments: \n" "The json workload files needs to be specified using\n" - "--vt_lb_stats_file_in and --vt_lb_stats_dir_in" + "--vt_lb_data_file_in and --vt_lb_data_dir_in" ); // initial phase to simulate @@ -62,7 +62,7 @@ int main(int argc, char** argv) { PhaseType phases_to_run = atoi(argv[2]); // the workloads used will be those specified with the command-line arguments - // --vt_lb_stats_file_in and --vt_lb_stats_dir_in + // --vt_lb_data_file_in and --vt_lb_data_dir_in vt::vrt::collection::balance::replay::replayWorkloads( initial_phase, phases_to_run ); From dafa15df897db797ca52fb0f7cace6736f1a9ca0 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 9 May 2022 13:02:20 -0700 Subject: [PATCH 29/41] #1265: replay: allow custom stats callback for testing --- .../vrt/collection/balance/workload_replay.cc | 13 ++++++----- .../vrt/collection/balance/workload_replay.h | 4 +++- .../collection/test_workload_data_migrator.cc | 22 ++++++++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 9517708d2b..ab2c6b7194 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -61,12 +61,18 @@ void replayWorkloads( auto const filename = theConfig()->getLBDataFileIn(); auto workloads = readInWorkloads(filename); - replayWorkloads(initial_phase, phases_to_run, workloads); + // use the default stats handler + auto stats_cb = vt::theCB()->makeBcast< + LBManager, balance::NodeStatsMsg, &LBManager::statsHandler + >(theLBManager()->getProxy()); + + replayWorkloads(initial_phase, phases_to_run, workloads, stats_cb); } void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, - std::shared_ptr workloads + std::shared_ptr workloads, + Callback stats_cb ) { using ObjIDType = elm::ElementIDStruct; @@ -202,9 +208,6 @@ void replayWorkloads( "Number of objects after LB: {}\n", migratable_objects_here.size() ); runInEpochCollective("postLBWorkForReplay -> computeStats", [=] { - auto stats_cb = vt::theCB()->makeBcast< - LBManager, balance::NodeStatsMsg, &LBManager::statsHandler - >(theLBManager()->getProxy()); theLBManager()->computeStatistics( proposed_model, false, phase, stats_cb ); diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 359173a507..94ec7f891a 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -46,6 +46,7 @@ #include "vt/config.h" #include "vt/elm/elm_id.h" +#include "vt/vrt/collection/balance/stats_msg.h" #include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/baselb/baselb.h" #include "vt/vrt/collection/balance/model/load_model.h" @@ -91,7 +92,8 @@ void replayWorkloads( */ void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, - std::shared_ptr workloads + std::shared_ptr workloads, + Callback stats_cb ); /** diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 03337a3783..f7911ef0bb 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -48,6 +48,7 @@ #include "vt/elm/elm_id.h" #include "vt/elm/elm_id_bits.h" +#include "vt/vrt/collection/balance/stats_msg.h" #include "vt/vrt/collection/balance/lb_common.h" #include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" @@ -72,6 +73,10 @@ std::shared_ptr setupWorkloads(PhaseType phase, size_t numElements) { auto const& this_node = vt::theContext()->getNode(); + if (this_node == 0) { + vt_print(replay, "Generating workloads to replay...\n"); + } + using vt::vrt::collection::balance::ElementIDStruct; std::vector myElemList(numElements); @@ -564,6 +569,10 @@ setupManyWorkloads( ) { auto const& this_node = vt::theContext()->getNode(); + if (this_node == 0) { + vt_print(replay, "Generating workloads to replay...\n"); + } + using vt::vrt::collection::balance::ElementIDStruct; std::vector myElemList(numElements); @@ -634,22 +643,29 @@ struct TestWorkloadReplay : TestParallelHarness { void addAdditionalArgs() override { static char vt_lb[]{"--vt_lb"}; static char vt_lb_name[]{"--vt_lb_name=RandomLB"}; - addArgs(vt_lb, vt_lb_name); + static char vt_lb_interval[]{"--vt_lb_interval=2"}; + addArgs(vt_lb, vt_lb_name, vt_lb_interval); } #endif }; TEST_F(TestWorkloadReplay, test_run_replay_no_verify) { PhaseType initial_phase = 1; - PhaseType num_phases = 3; + PhaseType num_phases = 5; const size_t numElements = 5; // first set up the workloads to replay, moving them around by phase auto lbdh = setupManyWorkloads(initial_phase, num_phases, numElements); + using LBManager = vt::vrt::collection::balance::LBManager; + using NodeStatsMsg = vt::vrt::collection::balance::NodeStatsMsg; + auto stats_cb = vt::theCB()->makeBcast< + LBManager, NodeStatsMsg, &LBManager::statsHandler + >(vt::theLBManager()->getProxy()); + // then replay them but allow the lb to place objects differently vt::vrt::collection::balance::replay::replayWorkloads( - initial_phase, num_phases, lbdh + initial_phase, num_phases, lbdh, stats_cb ); } From b9c9263842d6b258394fac6a65ade338e81f831b Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 9 May 2022 13:08:26 -0700 Subject: [PATCH 30/41] #1265: tests: verify statistics under replay --- .../collection/test_workload_data_migrator.cc | 146 +++++++++++++++++- 1 file changed, 138 insertions(+), 8 deletions(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index f7911ef0bb..4a731ab378 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -563,9 +563,133 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { } } + +struct StatsResults { + StatsResults(PhaseType initial_phase, PhaseType lb_interval) + : save_phase_(initial_phase), + comp_phase_(initial_phase), + lb_interval_(lb_interval) { } + + PhaseType save_phase_ = 0; + PhaseType comp_phase_ = 0; + PhaseType lb_interval_ = 1; + + std::unordered_map O_min_; + std::unordered_map O_max_; + std::unordered_map O_car_; + std::unordered_map P_sum_; + + using StatsMsgType = vt::vrt::collection::balance::NodeStatsMsg; + using Statistic = vt::vrt::collection::lb::Statistic; + + void saveStatsHandler(StatsMsgType* msg) { + auto in_stat_vec = msg->getConstVal(); + + auto const& this_node = vt::theContext()->getNode(); + + if (this_node == 0) { + vt_print(replay, "Saving subset of statistics for phase {}\n", comp_phase_); + } + + for (auto&& st : in_stat_vec) { + auto stat = st.stat_; + if (stat == Statistic::P_l) { + P_sum_[save_phase_] = st.sum(); + } else if (stat == Statistic::O_l) { + O_min_[save_phase_] = st.min(); + O_max_[save_phase_] = st.max(); + O_car_[save_phase_] = st.N_; + } + } + + ++save_phase_; + } + + void compStatsHandler(StatsMsgType* msg) { + auto in_stat_vec = msg->getConstVal(); + + auto const& this_node = vt::theContext()->getNode(); + + if (this_node == 0) { + vt_print(replay, "Comparing subset of post-LB statistics for phase {}\n", comp_phase_); + } + + for (auto&& st : in_stat_vec) { + auto stat = st.stat_; + if (stat == Statistic::P_l) { + EXPECT_EQ(P_sum_[comp_phase_], st.sum()); + } else if (stat == Statistic::O_l) { + EXPECT_EQ(O_min_[comp_phase_], st.min()); + EXPECT_EQ(O_max_[comp_phase_], st.max()); + EXPECT_EQ(O_car_[comp_phase_], st.N_); + } + } + + comp_phase_ += lb_interval_; + } +}; + +std::shared_ptr +migrateObjectsAndDoStatistics( + std::shared_ptr base_load_model, + vt::PhaseType phase, + vt::vrt::collection::balance::LBType balancer, + vt::objgroup::proxy::Proxy o_proxy +) { + std::shared_ptr new_model = nullptr; + + vt::runInEpochCollective("migrate", [&]{ + auto postLBWork = [&](ReassignmentMsg *msg) { + auto lb_reassignment = msg->reassignment; + if (lb_reassignment) { + vt_debug_print( + normal, replay, + "global_mig={}, depart={}, arrive={}\n", + lb_reassignment->global_migration_count, + lb_reassignment->depart_.size(), + lb_reassignment->arrive_.size() + ); + new_model = std::make_shared( + base_load_model, + WorkloadDataMigrator::updateCurrentNodes(lb_reassignment) + ); + runInEpochCollective("computeAndStoreStats", [=] { + auto stats_cb = vt::theCB()->makeBcast< + StatsResults, StatsResults::StatsMsgType, + &StatsResults::saveStatsHandler + >(o_proxy); + theLBManager()->computeStatistics(new_model, false, phase, stats_cb); + }); + } + }; + auto cb = theCB()->makeFunc( + vt::pipe::LifetimeEnum::Once, postLBWork + ); + theLBManager()->startLB(phase, balancer, cb); + }); + + runInEpochCollective("destroy lb", [&]{ + vt::theLBManager()->destroyLB(); + }); + + return new_model; +} + +std::shared_ptr +shiftObjectsRightAndDoStatistics( + std::shared_ptr base_load_model, + vt::PhaseType phase, vt::objgroup::proxy::Proxy o_proxy +) { + using vt::vrt::collection::balance::LBType; + return migrateObjectsAndDoStatistics( + base_load_model, phase, LBType::RotateLB, o_proxy + ); +} + std::shared_ptr setupManyWorkloads( - PhaseType initial_phase, PhaseType num_phases, size_t numElements + PhaseType initial_phase, PhaseType num_phases, size_t numElements, + vt::objgroup::proxy::Proxy o_proxy ) { auto const& this_node = vt::theContext()->getNode(); @@ -603,7 +727,7 @@ setupManyWorkloads( auto base_load_model = setupBaseModel(phase, lbdh); std::shared_ptr not_home_model = - shiftObjectsRight(base_load_model, phase); + shiftObjectsRightAndDoStatistics(base_load_model, phase, o_proxy); std::set migratable_objects_here; for (auto it = not_home_model->begin(); it.isValid(); ++it) { @@ -649,19 +773,25 @@ struct TestWorkloadReplay : TestParallelHarness { #endif }; -TEST_F(TestWorkloadReplay, test_run_replay_no_verify) { +TEST_F(TestWorkloadReplay, test_run_replay_verify_some_stats) { PhaseType initial_phase = 1; PhaseType num_phases = 5; const size_t numElements = 5; + const PhaseType lb_interval = 2; // make sure this matches the harness above + + auto o_proxy = vt::theObjGroup()->makeCollective( + initial_phase, lb_interval + ); // first set up the workloads to replay, moving them around by phase - auto lbdh = setupManyWorkloads(initial_phase, num_phases, numElements); + auto lbdh = setupManyWorkloads( + initial_phase, num_phases, numElements, o_proxy + ); - using LBManager = vt::vrt::collection::balance::LBManager; - using NodeStatsMsg = vt::vrt::collection::balance::NodeStatsMsg; + // make our own stats callback so that we can check the results auto stats_cb = vt::theCB()->makeBcast< - LBManager, NodeStatsMsg, &LBManager::statsHandler - >(vt::theLBManager()->getProxy()); + StatsResults, StatsResults::StatsMsgType, &StatsResults::compStatsHandler + >(o_proxy); // then replay them but allow the lb to place objects differently vt::vrt::collection::balance::replay::replayWorkloads( From c896627c508df9949b85ed3c155568bae0902900 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Mon, 9 May 2022 13:15:30 -0700 Subject: [PATCH 31/41] #1265: replay: fix doxygen --- src/vt/vrt/collection/balance/workload_replay.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 94ec7f891a..49d600bf7a 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -84,6 +84,7 @@ void replayWorkloads( * \param[in] initial_phase the first phase to replay * \param[in] phases_to_run how many phases to replay * \param[in] workloads the workload data to simulate + * \param[in] stats_cb callback for post-lb statistics * * LB data passed in will be fed through the load balancer(s) specified on the * vt command-line on each requested phase, allowing new load balancing From 3af9b7b51c75f6e5d62d111cf342ebb79a4c4421 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 16 Nov 2022 12:27:07 -0800 Subject: [PATCH 32/41] #1265: replay: make compatibility updates --- src/vt/vrt/collection/balance/workload_replay.cc | 6 ++++-- src/vt/vrt/collection/balance/workload_replay.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index ab2c6b7194..d643cdcaa3 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -256,7 +256,9 @@ readInWorkloads(const std::string &filename) { /*static*/ objgroup::proxy::Proxy WorkloadDataMigrator::construct(std::shared_ptr model_base) { - auto my_proxy = theObjGroup()->makeCollective(); + auto my_proxy = theObjGroup()->makeCollective( + "WorkloadDataMigrator" + ); auto strat = my_proxy.get(); auto base_proxy = my_proxy.template castToBase(); vt_debug_print( @@ -271,7 +273,7 @@ WorkloadDataMigrator::construct(std::shared_ptr model_base) { void WorkloadDataMigrator::runLB(TimeType) { } -void WorkloadDataMigrator::inputParams(SpecEntry* spec) { } +void WorkloadDataMigrator::inputParams(ConfigEntry* spec) { } std::unordered_map WorkloadDataMigrator::getInputKeysWithHelp() { diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index 49d600bf7a..a567270f60 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -134,7 +134,7 @@ struct WorkloadDataMigrator : lb::BaseLB { void runLB(TimeType) override; - void inputParams(SpecEntry* spec) override; + void inputParams(ConfigEntry* spec) override; static std::unordered_map getInputKeysWithHelp(); From d1b2194c0461d770282e0d7ec4fd95f0fae60a60 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 17 Oct 2023 12:15:10 -0700 Subject: [PATCH 33/41] #1265: tests: make compatibility updates --- .../collection/test_workload_data_migrator.cc | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 4a731ab378..7b31aad23e 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -214,15 +214,15 @@ TEST_F(TestWorkloadDataMigrator, test_normalize_call) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = new_model->getWork( + auto load = new_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = new_model->getWork( + auto subload0 = new_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = new_model->getWork( + auto subload1 = new_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -262,15 +262,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_home) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = back_home_model->getWork( + auto load = back_home_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = back_home_model->getWork( + auto subload0 = back_home_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = back_home_model->getWork( + auto subload1 = back_home_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -324,15 +324,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_home) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = back_home_if_not_here_model->getWork( + auto load = back_home_if_not_here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = back_home_if_not_here_model->getWork( + auto subload0 = back_home_if_not_here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = back_home_if_not_here_model->getWork( + auto subload1 = back_home_if_not_here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -376,15 +376,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_home) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = here_model->getWork( + auto load = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = here_model->getWork( + auto subload0 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = here_model->getWork( + auto subload1 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -439,15 +439,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_some_data_here_from_home) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = here_model->getWork( + auto load = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = here_model->getWork( + auto subload0 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = here_model->getWork( + auto subload1 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -493,15 +493,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_1) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = here_model->getWork( + auto load = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = here_model->getWork( + auto subload0 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = here_model->getWork( + auto subload1 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -547,15 +547,15 @@ TEST_F(TestWorkloadDataMigrator, test_move_data_here_from_whereever_2) { EXPECT_EQ(obj_id.getCurrNode(), this_node); using vt::vrt::collection::balance::PhaseOffset; - auto load = here_model->getWork( + auto load = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); EXPECT_EQ(load, obj_id.id * 2); - auto subload0 = here_model->getWork( + auto subload0 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 0} ); EXPECT_EQ(subload0, obj_id.id % 2 ? obj_id.id * 2 : 0); - auto subload1 = here_model->getWork( + auto subload1 = here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, 1} ); EXPECT_EQ(subload1, obj_id.id % 2 ? 0 : obj_id.id * 2); @@ -593,9 +593,9 @@ struct StatsResults { for (auto&& st : in_stat_vec) { auto stat = st.stat_; - if (stat == Statistic::P_l) { + if (stat == Statistic::Rank_load_modeled) { P_sum_[save_phase_] = st.sum(); - } else if (stat == Statistic::O_l) { + } else if (stat == Statistic::Object_load_modeled) { O_min_[save_phase_] = st.min(); O_max_[save_phase_] = st.max(); O_car_[save_phase_] = st.N_; @@ -616,9 +616,9 @@ struct StatsResults { for (auto&& st : in_stat_vec) { auto stat = st.stat_; - if (stat == Statistic::P_l) { + if (stat == Statistic::Rank_load_modeled) { EXPECT_EQ(P_sum_[comp_phase_], st.sum()); - } else if (stat == Statistic::O_l) { + } else if (stat == Statistic::Object_load_modeled) { EXPECT_EQ(O_min_[comp_phase_], st.min()); EXPECT_EQ(O_max_[comp_phase_], st.max()); EXPECT_EQ(O_car_[comp_phase_], st.N_); @@ -747,14 +747,14 @@ setupManyWorkloads( auto obj_id = *it; using vt::vrt::collection::balance::PhaseOffset; scrambled_lbdh->node_data_[phase][obj_id].whole_phase_load = - here_model->getWork( + here_model->getModeledLoad( obj_id, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE} ); scrambled_lbdh->node_data_[phase][*it].subphase_loads.push_back( - here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 0}) + here_model->getModeledLoad(obj_id, {PhaseOffset::NEXT_PHASE, 0}) ); scrambled_lbdh->node_data_[phase][*it].subphase_loads.push_back( - here_model->getWork(obj_id, {PhaseOffset::NEXT_PHASE, 1}) + here_model->getModeledLoad(obj_id, {PhaseOffset::NEXT_PHASE, 1}) ); } } @@ -780,7 +780,7 @@ TEST_F(TestWorkloadReplay, test_run_replay_verify_some_stats) { const PhaseType lb_interval = 2; // make sure this matches the harness above auto o_proxy = vt::theObjGroup()->makeCollective( - initial_phase, lb_interval + "StatsResults", initial_phase, lb_interval ); // first set up the workloads to replay, moving them around by phase From 3f7c9724c96566c09f591f93fb6b3583c73a2e40 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 17 Oct 2023 12:42:09 -0700 Subject: [PATCH 34/41] #1265: replay: make compatibility updates --- src/vt/vrt/collection/balance/workload_replay.cc | 6 +++--- src/vt/vrt/collection/balance/workload_replay.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index d643cdcaa3..49a22144f4 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -63,7 +63,7 @@ void replayWorkloads( // use the default stats handler auto stats_cb = vt::theCB()->makeBcast< - LBManager, balance::NodeStatsMsg, &LBManager::statsHandler + &LBManager::statsHandler >(theLBManager()->getProxy()); replayWorkloads(initial_phase, phases_to_run, workloads, stats_cb); @@ -72,7 +72,7 @@ void replayWorkloads( void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, std::shared_ptr workloads, - Callback stats_cb + Callback> stats_cb ) { using ObjIDType = elm::ElementIDStruct; @@ -271,7 +271,7 @@ WorkloadDataMigrator::construct(std::shared_ptr model_base) { return my_proxy; } -void WorkloadDataMigrator::runLB(TimeType) { } +void WorkloadDataMigrator::runLB(LoadType) { } void WorkloadDataMigrator::inputParams(ConfigEntry* spec) { } diff --git a/src/vt/vrt/collection/balance/workload_replay.h b/src/vt/vrt/collection/balance/workload_replay.h index a567270f60..2e36b39231 100644 --- a/src/vt/vrt/collection/balance/workload_replay.h +++ b/src/vt/vrt/collection/balance/workload_replay.h @@ -94,7 +94,7 @@ void replayWorkloads( void replayWorkloads( PhaseType initial_phase, PhaseType phases_to_run, std::shared_ptr workloads, - Callback stats_cb + Callback> stats_cb ); /** @@ -132,7 +132,7 @@ struct WorkloadDataMigrator : lb::BaseLB { static objgroup::proxy::Proxy construct(std::shared_ptr model_base); - void runLB(TimeType) override; + void runLB(LoadType) override; void inputParams(ConfigEntry* spec) override; From d9e0f147a57ee6a7526afd8fca063ec167c7d03b Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 17 Oct 2023 12:42:23 -0700 Subject: [PATCH 35/41] #1265: tests: make compatibility updates --- .../unit/collection/test_workload_data_migrator.cc | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/unit/collection/test_workload_data_migrator.cc b/tests/unit/collection/test_workload_data_migrator.cc index 7b31aad23e..7d20356fff 100644 --- a/tests/unit/collection/test_workload_data_migrator.cc +++ b/tests/unit/collection/test_workload_data_migrator.cc @@ -579,12 +579,10 @@ struct StatsResults { std::unordered_map O_car_; std::unordered_map P_sum_; - using StatsMsgType = vt::vrt::collection::balance::NodeStatsMsg; using Statistic = vt::vrt::collection::lb::Statistic; + using LoadData = vt::vrt::collection::balance::LoadData; - void saveStatsHandler(StatsMsgType* msg) { - auto in_stat_vec = msg->getConstVal(); - + void saveStatsHandler(std::vector const& in_stat_vec) { auto const& this_node = vt::theContext()->getNode(); if (this_node == 0) { @@ -605,9 +603,7 @@ struct StatsResults { ++save_phase_; } - void compStatsHandler(StatsMsgType* msg) { - auto in_stat_vec = msg->getConstVal(); - + void compStatsHandler(std::vector const& in_stat_vec) { auto const& this_node = vt::theContext()->getNode(); if (this_node == 0) { @@ -655,7 +651,6 @@ migrateObjectsAndDoStatistics( ); runInEpochCollective("computeAndStoreStats", [=] { auto stats_cb = vt::theCB()->makeBcast< - StatsResults, StatsResults::StatsMsgType, &StatsResults::saveStatsHandler >(o_proxy); theLBManager()->computeStatistics(new_model, false, phase, stats_cb); @@ -790,7 +785,7 @@ TEST_F(TestWorkloadReplay, test_run_replay_verify_some_stats) { // make our own stats callback so that we can check the results auto stats_cb = vt::theCB()->makeBcast< - StatsResults, StatsResults::StatsMsgType, &StatsResults::compStatsHandler + &StatsResults::compStatsHandler >(o_proxy); // then replay them but allow the lb to place objects differently From 245e27b32e540ad0762ba94065c03d5547495398 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 18 Oct 2023 14:02:46 -0700 Subject: [PATCH 36/41] #1265: phase manager: use phase passed from lb manager --- src/vt/phase/phase_manager.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 99089d10c2..cdb56585d3 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -301,7 +301,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info) phase, "phase={}, duration={}, rank_max_compute_time={}, rank_avg_compute_time={}, imbalance={:.3f}, " "grain_max_time={}, migration count={}, lb_name={}\n", - cur_phase_, + last_phase_info->phase, total_time, TimeType(last_phase_info->max_load), TimeType(last_phase_info->avg_load), @@ -313,7 +313,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info) // vt_print( // phase, // "POST phase={}, total time={}, max_load={}, avg_load={}, imbalance={:.3f}, migration count={}\n", - // cur_phase_, + // last_phase_info->phase, // total_time, // TimeType(last_phase_info->max_load_post_lb), // TimeType(last_phase_info->avg_load_post_lb), @@ -336,7 +336,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info) auto percent_improvement = compute_percent_improvement( last_phase_info->max_load, last_phase_info->avg_load ); - if (percent_improvement > 3.0 and cur_phase_ > 0) { + if (percent_improvement > 3.0 and last_phase_info->phase > 0) { if (grain_percent_improvement < 0.5) { // grain size is blocking improvement vt_print( @@ -395,7 +395,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info) } } } - } else if (cur_phase_ == 0) { + } else if (last_phase_info->phase == 0) { // ran the lb on a phase that may have included initialization costs vt_print( phase, From ae5f09172622351914b013e342310ea0d1200d61 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 18 Oct 2023 14:04:35 -0700 Subject: [PATCH 37/41] #1265: lb manager: add accessors needed for statistics computation --- src/vt/vrt/collection/balance/lb_invoke/lb_manager.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h index a7c8257d73..020a689b03 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h @@ -261,6 +261,10 @@ struct LBManager : runtime::component::Component { void statsHandler(std::vector const& in_stat_vec); + lb::PhaseInfo *getPhaseInfo() { return last_phase_info_.get(); } + + void setComputingBeforeLBStats(bool before_lb) { before_lb_stats_ = before_lb; } + private: bool isCollectiveComm(elm::CommCategory cat) const; From 638bc0b3d42f81faf31e59ebd1ebd381b361798f Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Wed, 18 Oct 2023 14:05:14 -0700 Subject: [PATCH 38/41] #1265: replay: print phase summary when simulating phase --- src/vt/vrt/collection/balance/workload_replay.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 49a22144f4..982ef4a8f9 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -45,6 +45,7 @@ #include "vt/vrt/collection/balance/workload_replay.h" #include "vt/vrt/collection/balance/lb_data_holder.h" #include "vt/vrt/collection/balance/lb_invoke/lb_manager.h" +#include "vt/phase/phase_manager.h" #include "vt/utils/json/json_reader.h" #include @@ -202,12 +203,17 @@ void replayWorkloads( ); } } + auto last_phase_info = theLBManager()->getPhaseInfo(); + last_phase_info->migration_count = lb_reassignment->global_migration_count; + last_phase_info->ran_lb = true; + last_phase_info->phase = phase; } vt_debug_print( terse, replay, "Number of objects after LB: {}\n", migratable_objects_here.size() ); runInEpochCollective("postLBWorkForReplay -> computeStats", [=] { + theLBManager()->setComputingBeforeLBStats(false); theLBManager()->computeStatistics( proposed_model, false, phase, stats_cb ); @@ -222,6 +228,8 @@ void replayWorkloads( theLBManager()->destroyLB(); }); theCollective()->barrier(); + auto last_phase_info = theLBManager()->getPhaseInfo(); + thePhase()->printSummary(last_phase_info); } } From 64dd0997912235d3205344caf39e8389199adb79 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 7 Nov 2023 13:54:54 -0800 Subject: [PATCH 39/41] #1265: build: make tools build option lowercase --- CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d8599bea8..7c1e8e55c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,9 +118,9 @@ endif() # # Tools # -option(VT_BUILD_TOOLS "Build VT tools" ON) +option(vt_build_tools "Build VT tools" ON) -if (VT_BUILD_TOOLS) +if (vt_build_tools) message( STATUS "VT: building tools" @@ -130,8 +130,7 @@ if (VT_BUILD_TOOLS) add_subdirectory(tools) else() message( - STATUS "VT: NOT building tools because VT_BUILD_TOOLS is not set.\ - Tools that are not built are NOT TESTED." + STATUS "VT: NOT building tools because vt_build_tools is not set." ) endif() From 43b775524a1842eddde90ca727d169a487790d34 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 7 Nov 2023 13:55:31 -0800 Subject: [PATCH 40/41] #1265: build: remove testing comment from macro --- tools/CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 2dfcb01d4a..079d49dad9 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -21,15 +21,6 @@ macro(add_tool tool_name) TARGET ${tool_name} DEFAULT_LINK_SET ) - -### @todo Add command-line arguments for testing -# if (BUILD_TESTING) -# add_test_for_example_vt( -# ${tool_name} -# ${TOOL_FILE} -# tool_tests -# ) -# endif() endmacro() add_subdirectory(workload_replay) From 794299dbb6f28ee8608094ba624c3c4593a227c4 Mon Sep 17 00:00:00 2001 From: Nicole Lemaster Slattengren Date: Tue, 7 Nov 2023 13:56:31 -0800 Subject: [PATCH 41/41] #1265: replay: remove unnecessary barrier --- src/vt/vrt/collection/balance/workload_replay.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vt/vrt/collection/balance/workload_replay.cc b/src/vt/vrt/collection/balance/workload_replay.cc index 982ef4a8f9..ebe5e072bd 100644 --- a/src/vt/vrt/collection/balance/workload_replay.cc +++ b/src/vt/vrt/collection/balance/workload_replay.cc @@ -227,7 +227,6 @@ void replayWorkloads( runInEpochCollective("WorkloadReplayDriver -> destroyLB", [&] { theLBManager()->destroyLB(); }); - theCollective()->barrier(); auto last_phase_info = theLBManager()->getPhaseInfo(); thePhase()->printSummary(last_phase_info); }