Skip to content

Commit

Permalink
#2382: ccm-lb: add max iter time and cycle lock count
Browse files Browse the repository at this point in the history
  • Loading branch information
lifflander committed Dec 19, 2024
1 parent ccb3bb1 commit b07301e
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 5 deletions.
25 changes: 20 additions & 5 deletions src/vt/vrt/collection/balance/temperedlb/temperedlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,7 @@ void TemperedLB::doLBStages(LoadType start_imb) {

for (iter_ = 0; iter_ < num_iters_; iter_++) {
bool first_iter = iter_ == 0;
iter_time_ = MPI_Wtime();

if (first_iter) {
// Copy this node's object assignments to a local, mutable copy
Expand Down Expand Up @@ -1178,6 +1179,7 @@ void TemperedLB::doLBStages(LoadType start_imb) {
is_overloaded_ = is_underloaded_ = false;
ready_to_satisfy_locks_ = false;
other_rank_clusters_.clear();
cycle_locks_ = 0;

// Not clearing shared_block_size_ because this never changes and
// the knowledge might be useful
Expand Down Expand Up @@ -1408,7 +1410,7 @@ void TemperedLB::loadStatsHandler(std::vector<balance::LoadData> const& vec) {
}

void TemperedLB::rejectionStatsHandler(
int n_rejected, int n_transfers, int n_unhomed_blocks
int n_rejected, int n_transfers, int n_unhomed_blocks, int cycle_locks
) {
double rej = static_cast<double>(n_rejected) /
static_cast<double>(n_rejected + n_transfers) * 100.0;
Expand All @@ -1419,8 +1421,18 @@ void TemperedLB::rejectionStatsHandler(
terse, temperedlb,
"TemperedLB::rejectionStatsHandler: n_transfers={} n_unhomed_blocks={}"
" n_rejected={} "
"rejection_rate={:0.1f}%\n",
n_transfers, n_unhomed_blocks, n_rejected, rej
"rejection_rate={:0.1f}%, total_cycle_locks={}\n",
n_transfers, n_unhomed_blocks, n_rejected, rej, cycle_locks
);
}
}

/**
 * \brief Print the reduced iteration-time value on node 0.
 *
 * In the code visible in this change, this is the handler target of an
 * allreduce using collective::MaxOp over the member iter_time_.
 *
 * NOTE(review): iter_time_ is assigned directly from MPI_Wtime() at the top of
 * each iteration, so the value received here looks like a wall-clock start
 * timestamp rather than an elapsed duration -- confirm this is intended.
 *
 * \param[in] max_iter_time the value produced by the reduction
 */
void TemperedLB::maxIterTime(double max_iter_time) {
  // Every node other than node 0 stays silent.
  if (theContext()->getNode() != 0) {
    return;
  }

  vt_debug_print(
    terse, temperedlb,
    "TemperedLB::maxIterTime: {}\n", max_iter_time
  );
}
Expand Down Expand Up @@ -2136,8 +2148,9 @@ void TemperedLB::originalTransfer() {
// compute rejection rate because it will be printed
runInEpochCollective("TemperedLB::originalTransfer -> compute rejection", [=] {
proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>(
n_rejected, n_transfers, 0
n_rejected, n_transfers, 0, 0
);
proxy_.allreduce<&TemperedLB::maxIterTime, collective::MaxOp>(iter_time_);
});
}
}
Expand Down Expand Up @@ -2525,6 +2538,7 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) {
};

if (is_locked_ && locking_rank_ <= msg->locked_node) {
cycle_locks_++;
proxy_[msg->locked_node].template send<&TemperedLB::releaseLock>();
theTerm()->consume(cur_epoch);
try_locks_.emplace(msg->locked_node, msg->locked_c_try, 1);
Expand Down Expand Up @@ -2708,8 +2722,9 @@ void TemperedLB::swapClusters() {
auto remote_block_count = getRemoteBlockCountHere();
runInEpochCollective("TemperedLB::swapClusters -> compute rejection", [=] {
proxy_.allreduce<&TemperedLB::rejectionStatsHandler, collective::PlusOp>(
n_rejected, n_transfers_swap_, remote_block_count
n_rejected, n_transfers_swap_, remote_block_count, cycle_locks_, iter_time_
);
proxy_.allreduce<&TemperedLB::maxIterTime, collective::MaxOp>(iter_time_);
});
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/vt/vrt/collection/balance/temperedlb/temperedlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ struct TemperedLB : BaseLB {
/// Reduction handler that reports transfer/rejection statistics.
/// The parameter list must match the out-of-line definition in temperedlb.cc,
/// which takes and prints a fourth value, \c cycle_locks.
///
/// \param n_rejected reduced count of rejected transfers
/// \param n_transfers reduced count of accepted transfers
/// \param n_unhomed_blocks reduced count of unhomed blocks
/// \param cycle_locks reduced count of cycle locks
void rejectionStatsHandler(
  int n_rejected, int n_transfers, int n_unhomed_blocks, int cycle_locks
);
void maxIterTime(double max_iter_time);
void remoteBlockCountHandler(int n_unhomed_blocks);
void thunkMigrations();

Expand Down Expand Up @@ -424,6 +425,8 @@ struct TemperedLB : BaseLB {
StatisticMapType stats;
LoadType this_load = 0.0f;
LoadType this_work = 0.0f;
int cycle_locks_ = 0;
double iter_time_ = 0.0f;
/// Whether any node has communication data
bool has_comm_any_ = false;

Expand Down

0 comments on commit b07301e

Please sign in to comment.