Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1659: Improve communication statistics in VT #1993

Merged
merged 14 commits into from
Dec 14, 2022
Merged
52 changes: 52 additions & 0 deletions scripts/JSON_data_files_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,19 @@ def _get_valid_schema(self) -> Schema:
"sum": float,
"var": float
},
Optional("Object_strategy_specific_load_modeled"): {
"avg": float,
"car": float,
"imb": float,
"kur": float,
"max": float,
"min": float,
"npr": float,
"skw": float,
"std": float,
"sum": float,
"var": float
},
"Rank_comm": {
"avg": float,
"car": float,
Expand Down Expand Up @@ -184,6 +197,19 @@ def _get_valid_schema(self) -> Schema:
"std": float,
"sum": float,
"var": float
},
Optional("Rank_strategy_specific_load_modeled"): {
"avg": float,
"car": float,
"imb": float,
"kur": float,
"max": float,
"min": float,
"npr": float,
"skw": float,
"std": float,
"sum": float,
"var": float
}
},
"pre-LB": {
Expand Down Expand Up @@ -226,6 +252,19 @@ def _get_valid_schema(self) -> Schema:
"sum": float,
"var": float
},
Optional("Object_strategy_specific_load_modeled"): {
"avg": float,
"car": float,
"imb": float,
"kur": float,
"max": float,
"min": float,
"npr": float,
"skw": float,
"std": float,
"sum": float,
"var": float
},
"Rank_comm": {
"avg": float,
"car": float,
Expand Down Expand Up @@ -264,6 +303,19 @@ def _get_valid_schema(self) -> Schema:
"std": float,
"sum": float,
"var": float
},
Optional("Rank_strategy_specific_load_modeled"): {
"avg": float,
"car": float,
"imb": float,
"kur": float,
"max": float,
"min": float,
"npr": float,
"skw": float,
"std": float,
"sum": float,
"var": float
}
}
},
Expand Down
9 changes: 0 additions & 9 deletions src/vt/elm/elm_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,6 @@ using CommMapType = std::unordered_map<CommKeyType,CommVolume>;

namespace std {

template <>
struct hash<vt::elm::CommCategory> {
size_t operator()(vt::elm::CommCategory const& in) const {
using LBUnderType = std::underlying_type<vt::elm::CommCategory>::type;
auto const val = static_cast<LBUnderType>(in);
return std::hash<LBUnderType>()(val);
}
};

template <>
struct hash<vt::elm::CommKey> {
size_t operator()(vt::elm::CommKey const& in) const {
Expand Down
14 changes: 0 additions & 14 deletions src/vt/utils/memory/memory_units.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,4 @@ std::tuple<std::string, double> getBestMemoryUnit(std::size_t bytes);

}}} /* end namespace vt::util::memory */

namespace std {

/**
 * \brief Hash specialization for \c vt::util::memory::MemoryUnitEnum
 *
 * Casts the enumerator to its underlying integral type and returns the
 * standard hash of that value.
 */
template <>
struct hash<vt::util::memory::MemoryUnitEnum> {
  size_t operator()(vt::util::memory::MemoryUnitEnum const& in) const {
    using UnitRawType =
      std::underlying_type<vt::util::memory::MemoryUnitEnum>::type;
    return std::hash<UnitRawType>()(static_cast<UnitRawType>(in));
  }
};

} /* end namespace std */

#endif /*INCLUDED_VT_UTILS_MEMORY_MEMORY_UNITS_H*/
6 changes: 6 additions & 0 deletions src/vt/vrt/collection/balance/baselb/baselb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,12 @@ void BaseLB::recvSharedEdges(CommMsg* msg) {
}
}

/**
 * \brief Hand a strategy-specific load model to the LB manager
 *
 * Forwards \c model unchanged to \c setStrategySpecificModel on the object
 * returned by \c theLBManager().
 *
 * \param[in] model the load model to forward
 */
void BaseLB::setStrategySpecificModel(
std::shared_ptr<balance::LoadModel> model
) {
theLBManager()->setStrategySpecificModel(model);
}

}}}} /* end namespace vt::vrt::collection::lb */

#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_BASELB_BASELB_CC*/
6 changes: 4 additions & 2 deletions src/vt/vrt/collection/balance/baselb/baselb.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ struct BaseLB {
protected:
void getArgs(PhaseType phase);

protected:
double start_time_ = 0.0f;
ElementCommType const* comm_data = nullptr;
objgroup::proxy::Proxy<BaseLB> proxy_ = {};
Expand All @@ -161,7 +160,6 @@ struct BaseLB {
balance::LoadModel* load_model_ = nullptr;
bool comm_aware_ = false;

protected:
/**
* \brief Normalizes the reassignment graph by setting up in/out edges on both
* sides regardless of how they are passed to \c migrateObjectTo
Expand All @@ -170,6 +168,10 @@ struct BaseLB {
*/
std::shared_ptr<const balance::Reassignment> normalizeReassignments();

/**
 * \brief Set the strategy-specific load model
 *
 * \param[in] model the load model to set
 */
static void setStrategySpecificModel(
std::shared_ptr<balance::LoadModel> model
);

private:
TransferVecType transfers_ = {};
TransferType off_node_migrate_ = {};
Expand Down
13 changes: 0 additions & 13 deletions src/vt/vrt/collection/balance/greedylb/greedylb.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,4 @@ struct GreedyLB : LoadSamplerBaseLB {

}}}} /* end namespace vt::vrt::collection::lb */

namespace std {

/**
 * \brief Hash specialization for \c vt::vrt::collection::lb::DataDistStrategy
 *
 * Casts the enumerator to its underlying integral type and returns the
 * standard hash of that value.
 */
template <>
struct hash<vt::vrt::collection::lb::DataDistStrategy> {
  size_t operator()(vt::vrt::collection::lb::DataDistStrategy const& in) const {
    using StrategyRawType =
      std::underlying_type<vt::vrt::collection::lb::DataDistStrategy>::type;
    auto const raw_value = static_cast<StrategyRawType>(in);
    std::hash<StrategyRawType> const hasher{};
    return hasher(raw_value);
  }
};

} /* end namespace std */

#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_GREEDYLB_GREEDYLB_H*/
2 changes: 1 addition & 1 deletion src/vt/vrt/collection/balance/lb_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ nlohmann::json jsonifyPhaseStatistics(const StatisticMap &statistics) {
nlohmann::json j;

for (auto &entry : statistics) {
auto &name = get_lb_stat_name()[entry.first];
auto &name = get_lb_stat_names()[entry.first];
nlohmann::json &this_stat = j[name];
for (auto &quant : entry.second) {
const nlohmann::json quant_name = quant.first;
Expand Down
26 changes: 3 additions & 23 deletions src/vt/vrt/collection/balance/lb_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ enum struct StatisticQuantity : int8_t {
};

enum struct Statistic : int8_t {
Rank_load_modeled, Rank_load_raw, Rank_comm, Rank_work_modeled,
Object_load_modeled, Object_load_raw, Object_comm, Object_work_modeled,
Rank_load_modeled, Rank_load_raw, Rank_comm, Rank_strategy_specific_load_modeled,
Object_load_modeled, Object_load_raw, Object_comm, Object_strategy_specific_load_modeled,
// W_l_min, W_l_max, W_l_avg, W_l_std, W_l_var, W_l_skewness, W_l_kurtosis,
// W_c_min, W_c_max, W_c_avg, W_c_std, W_c_var, W_c_skewness, W_c_kurtosis,
// W_t_min, W_t_max, W_t_avg, W_t_std, W_t_var, W_t_skewness, W_t_kurtosis,
Expand All @@ -197,27 +197,7 @@ using StatisticMap = std::unordered_map<Statistic, StatisticQuantityMap>;

nlohmann::json jsonifyPhaseStatistics(const StatisticMap &statistics);

} /* end namespace lb */

}}} /* end namespace vt::vrt::collection */

namespace std {

/**
 * \brief Hash specialization for \c vt::vrt::collection::lb::Statistic
 *
 * Casts the enumerator to its underlying integral type and returns the
 * standard hash of that value.
 */
template <>
struct hash<vt::vrt::collection::lb::Statistic> {
  size_t operator()(vt::vrt::collection::lb::Statistic const& in) const {
    using StatRawType =
      std::underlying_type<vt::vrt::collection::lb::Statistic>::type;
    return std::hash<StatRawType>()(static_cast<StatRawType>(in));
  }
};

} /* end namespace std */

namespace vt { namespace vrt { namespace collection { namespace lb {

std::unordered_map<Statistic, std::string>& get_lb_stat_name();
std::unordered_map<Statistic, std::string>& get_lb_stat_names();

}}}} /* end namespace vt::vrt::collection::lb */

Expand Down
47 changes: 32 additions & 15 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -215,19 +215,17 @@ void LBManager::defaultPostLBWork(ReassignmentMsg* msg) {
}

void
LBManager::runLB(
LBProxyType base_proxy, PhaseType phase, vt::Callback<ReassignmentMsg> cb
) {
LBManager::runLB(PhaseType phase, vt::Callback<ReassignmentMsg> cb) {
runInEpochCollective("LBManager::runLB -> updateLoads", [=] {
model_->updateLoads(phase);
});

auto base_proxy = lb_instances_["chosen"];
lb::BaseLB* strat = base_proxy.get();
auto proxy = lb_instances_["chosen"];
if (strat->isCommAware()) {
runInEpochCollective(
"LBManager::runLB -> makeGraphSymmetric",
[phase, proxy] { makeGraphSymmetric(phase, proxy); }
[phase, base_proxy] { makeGraphSymmetric(phase, base_proxy); }
);
}

Expand Down Expand Up @@ -342,8 +340,7 @@ void LBManager::startLB(
break;
}

LBProxyType base_proxy = lb_instances_["chosen"];
runLB(base_proxy, phase, cb);
runLB(phase, cb);
}

/*static*/
Expand Down Expand Up @@ -532,7 +529,7 @@ void LBManager::statsHandler(StatsMsgType* msg) {
" max={:.2f}, min={:.2f}, sum={:.2f}, avg={:.2f}, var={:.2f},"
" stdev={:.2f}, nproc={}, cardinality={} skewness={:.2f}, kurtosis={:.2f},"
" npr={}, imb={:.2f}, num_stats={}\n",
lb::get_lb_stat_name()[stat],
lb::get_lb_stat_names()[stat],
max, min, sum, avg, var, stdv, npr, car, skew, krte, npr, imb,
stats.size()
);
Expand Down Expand Up @@ -626,14 +623,14 @@ void LBManager::computeStatistics(
"computeStatistics\n"
);

using ReduceOp = collective::PlusOp<std::vector<balance::LoadData>>;
const balance::PhaseOffset when = {
balance::PhaseOffset::NEXT_PHASE, balance::PhaseOffset::WHOLE_PHASE
};

total_load_from_model = 0.;
std::vector<balance::LoadData> obj_load_model;
for (auto elm : *model) {
auto work = model->getModeledLoad(
elm, {balance::PhaseOffset::NEXT_PHASE, balance::PhaseOffset::WHOLE_PHASE}
);
auto work = model->getModeledLoad(elm, when);
obj_load_model.emplace_back(
LoadData{lb::Statistic::Object_load_modeled, work}
);
Expand All @@ -644,9 +641,7 @@ void LBManager::computeStatistics(
std::vector<balance::LoadData> obj_load_raw;
if (model->hasRawLoad()) {
for (auto elm : *model) {
auto raw_load = model->getRawLoad(
elm, {balance::PhaseOffset::NEXT_PHASE, balance::PhaseOffset::WHOLE_PHASE}
);
auto raw_load = model->getRawLoad(elm, when);
obj_load_raw.emplace_back(
LoadData{lb::Statistic::Object_load_raw, raw_load}
);
Expand All @@ -669,6 +664,27 @@ void LBManager::computeStatistics(
lb::Statistic::Object_load_modeled, std::move(obj_load_model)
));

if (strategy_specific_model_) {
auto rank_strat_specific_load = 0.;
std::vector<balance::LoadData> obj_strat_specific_load;
for (auto elm : *strategy_specific_model_) {
auto work = strategy_specific_model_->getModeledLoad(elm, when);
obj_strat_specific_load.emplace_back(
LoadData{lb::Statistic::Object_strategy_specific_load_modeled, work}
);
rank_strat_specific_load += work;
}

lstats.emplace_back(
LoadData{lb::Statistic::Rank_strategy_specific_load_modeled,
rank_strat_specific_load}
);
lstats.emplace_back(reduceVec(
lb::Statistic::Object_strategy_specific_load_modeled,
std::move(obj_strat_specific_load)
));
}

if (model->hasRawLoad()) {
lstats.emplace_back(
LoadData{lb::Statistic::Rank_load_raw, total_load_raw}
Expand Down Expand Up @@ -701,6 +717,7 @@ void LBManager::computeStatistics(
lb::Statistic::Object_comm, std::move(obj_comm)
));

using ReduceOp = collective::PlusOp<std::vector<balance::LoadData>>;
auto msg = makeMessage<StatsMsgType>(std::move(lstats));
proxy_.template reduce<ReduceOp>(msg,cb);
}
Expand Down
16 changes: 9 additions & 7 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ struct LBManager : runtime::component::Component<LBManager> {
using QuantityType = lb::StatisticQuantityMap;
using StatisticMapType = lb::StatisticMap;

friend lb::BaseLB;

/**
* \internal \brief System call to construct a \c LBManager
*/
Expand Down Expand Up @@ -235,15 +237,12 @@ struct LBManager : runtime::component::Component<LBManager> {
protected:
/**
* \internal
* \brief Run the load balancer
* \brief Run the currently chosen load balancer
*
* \param[in] base_proxy the base proxy for the LB
* \param[in] phase the phase
* \param[in] cb the callback for delivering the reassignment
*/
void runLB(
LBProxyType base_proxy, PhaseType phase, vt::Callback<ReassignmentMsg> cb
);
void runLB(PhaseType phase, vt::Callback<ReassignmentMsg> cb);

void defaultPostLBWork(ReassignmentMsg* r);

Expand All @@ -266,7 +265,6 @@ struct LBManager : runtime::component::Component<LBManager> {
private:
bool isCollectiveComm(elm::CommCategory cat) const;

private:
/**
* \internal \brief Create the statistics file
*/
Expand All @@ -277,13 +275,17 @@ struct LBManager : runtime::component::Component<LBManager> {
*/
void closeStatisticsFile();

private:
/**
 * \internal \brief Store the strategy-specific load model
 *
 * Assigns \c model to \c strategy_specific_model_, replacing any model
 * previously held there.
 *
 * \param[in] model the load model to store
 */
void setStrategySpecificModel(std::shared_ptr<LoadModel> model) {
strategy_specific_model_ = model;
}

PhaseType cached_phase_ = no_lb_phase;
LBType cached_lb_ = LBType::NoLB;
std::function<void()> destroy_lb_ = nullptr;
objgroup::proxy::Proxy<LBManager> proxy_;
std::shared_ptr<LoadModel> base_model_;
std::shared_ptr<LoadModel> model_;
std::shared_ptr<LoadModel> strategy_specific_model_;
std::unordered_map<std::string, LBProxyType> lb_instances_;
StatisticMapType stats;
TimeType total_load_from_model = 0.;
Expand Down
Loading