Skip to content

Commit

Permalink
#2201: checkpoint of non-breaking changes (documentation and style)
Browse files Browse the repository at this point in the history
  • Loading branch information
ppebay committed Nov 27, 2023
1 parent d7ef89a commit 7c09e94
Showing 1 changed file with 16 additions and 8 deletions.
24 changes: 16 additions & 8 deletions src/vt/vrt/collection/balance/temperedlb/temperedlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,9 +455,9 @@ void TemperedLB::inputParams(balance::ConfigEntry* config) {
void TemperedLB::runLB(LoadType total_load) {
bool should_lb = false;

// Compute load statistics
this_load = total_load;
stats = *getStats();

auto const avg = stats.at(lb::Statistic::Rank_load_modeled).at(
lb::StatisticQuantity::avg
);
Expand All @@ -481,10 +481,12 @@ void TemperedLB::runLB(LoadType total_load) {
target_max_load_ = avg;
}

if (avg > 0.0000000001) {
// Use an absolute minimal bound on average load to load-balance
if (avg > 1e-10) {
should_lb = max > (run_temperedlb_tolerance + 1.0) * target_max_load_;
}

// Report statistics from head rank
if (theContext()->getNode() == 0) {
vt_debug_print(
terse, temperedlb,
Expand All @@ -501,6 +503,7 @@ void TemperedLB::runLB(LoadType total_load) {
}
}

// Perform load rebalancing when deemed necessary
if (should_lb) {
doLBStages(imb);
}
Expand Down Expand Up @@ -814,15 +817,16 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {
selected.insert(this_node);
}

// Determine fanout factor capped by number of nodes
auto const fanout = std::min(f_, static_cast<decltype(f_)>(num_nodes - 1));

vt_debug_print(
verbose, temperedlb,
"TemperedLB::propagateRound: trial={}, iter={}, k_max={}, k_cur={}, "
"selected.size()={}, fanout={}\n",
trial_, iter_, k_max_, k_cur, selected.size(), fanout
);

// Perform at most fanout iterations
for (int i = 0; i < fanout; i++) {
// This implies full knowledge of all processors
if (selected.size() >= static_cast<size_t>(num_nodes)) {
Expand All @@ -849,6 +853,7 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {

// Send message with load
if (sync) {
// Message in synchronous mode
auto msg = makeMessage<LoadMsgSync>(this_node, load_info_);
if (epoch != no_epoch) {
envelopeSetEpoch(msg->env, epoch);
Expand All @@ -858,6 +863,7 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {
LoadMsgSync, &TemperedLB::propagateIncomingSync
>(msg.get());
} else {
// Message in asynchronous mode
auto msg = makeMessage<LoadMsgAsync>(this_node, load_info_, k_cur);
if (epoch != no_epoch) {
envelopeSetEpoch(msg->env, epoch);
Expand Down Expand Up @@ -1216,8 +1222,10 @@ std::vector<TemperedLB::ObjIDType> TemperedLB::orderObjects(
void TemperedLB::decide() {
auto lazy_epoch = theTerm()->makeEpochCollective("TemperedLB: decide");

// Initialize transfer and rejection counters
int n_transfers = 0, n_rejected = 0;

// Try to migrate objects away only when this node is overloaded
if (is_overloaded_) {
std::vector<NodeType> under = makeUnderloaded();
std::unordered_map<NodeType, ObjsType> migrate_objs;
Expand Down Expand Up @@ -1250,6 +1258,7 @@ void TemperedLB::decide() {
}
// Rebuild the CMF with the new loads taken into account
auto cmf = createCMF(under);

// Select a node using the CMF
auto const selected_node = sampleFromCMF(under, cmf);

Expand All @@ -1259,16 +1268,15 @@ void TemperedLB::decide() {
selected_node, load_info_.size()
);

// Find load of selected node
auto load_iter = load_info_.find(selected_node);
vtAssert(load_iter != load_info_.end(), "Selected node not found");

// The load of the node selected
auto& selected_load = load_iter->second;

// Evaluate criterion for proposed transfer
bool eval = Criterion(criterion_)(
this_new_load_, selected_load, obj_load, target_max_load_
);

vt_debug_print(
verbose, temperedlb,
"TemperedLB::decide: trial={}, iter={}, under.size()={}, "
Expand All @@ -1288,9 +1296,10 @@ void TemperedLB::decide() {
eval
);

// Decide about proposed migration based on criterion evaluation
if (eval) {
++n_transfers;
// transfer the object load in seconds
// Transfer the object load in seconds
// to match the object load units on the receiving end
migrate_objs[selected_node][obj_id] = obj_load;

Expand All @@ -1315,7 +1324,6 @@ void TemperedLB::decide() {
auto node = migration.first;
lazyMigrateObjsTo(lazy_epoch, node, migration.second);
}

} else {
// Do nothing (underloaded-based algorithm): wait to get work from
// overloaded nodes
Expand Down

0 comments on commit 7c09e94

Please sign in to comment.