Skip to content

Commit

Permalink
#2201: wip: fix review comments; add collection_id to synthetic data
Browse files Browse the repository at this point in the history
  • Loading branch information
cwschilly authored and cz4rs committed Sep 20, 2024
1 parent 63ef499 commit 23bd9c3
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 27 deletions.
57 changes: 38 additions & 19 deletions src/vt/vrt/collection/balance/temperedlb/temperedlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ Default: true
Description:
If the final iteration of a trial has a worse imbalance than any earlier
iteration, it will roll back to the iteration with the best imbalance.
If transfer_strategy is SwapClusters, rollback is automatically set to false.
)"
},
{
Expand Down Expand Up @@ -291,32 +292,32 @@ Description: α in the work model (load in work model)
"beta",
R"(
Values: <double>
Default: 1.0
Default: 0.0
Description: β in the work model (inter-node communication in work model)
)"
},
{
"epsilon",
"gamma",
R"(
Values: <double>
Default: 1.0
Description: ε in the work model (memory term in work model)
Default: 0.0
Description: γ in the work model (intra-node communication in work model)
)"
},
{
"delta",
R"(
Values: <double>
Default: 1.0
Default: 0.0
Description: δ in the work model (shared-memory-edges in work model)
)"
},
{
"gamma",
"epsilon",
R"(
Values: <double>
Default: 1.0
Description: γ in the work model (intra-node communication in work model)
Default: infinity
Description: ε in the work model (memory term in work model)
)"
}
};
Expand Down Expand Up @@ -456,6 +457,10 @@ void TemperedLB::inputParams(balance::ConfigEntry* config) {
);
transfer_type_ = transfer_type_converter_.getFromConfig(config, transfer_type_);

if (transfer_type_ == TransferTypeEnum::SwapClusters) {
rollback_ = false;
}

balance::LBArgsEnumConverter<ObjectOrderEnum> obj_ordering_converter_(
"ordering", "ObjectOrderEnum", {
{ObjectOrderEnum::Arbitrary, "Arbitrary"},
Expand Down Expand Up @@ -1066,10 +1071,22 @@ void TemperedLB::doLBStages(LoadType start_imb) {
if (first_iter) {
// Copy this node's object assignments to a local, mutable copy
cur_objs_.clear();
int total_num_objs = 0;
int num_migratable_objs = 0;
for (auto obj : *load_model_) {
cur_objs_[obj] = getModeledValue(obj);
total_num_objs++;
if (obj.isMigratable()) {
num_migratable_objs++;
cur_objs_[obj] = getModeledValue(obj);
}
}

vt_debug_print(
normal, temperedlb,
"TemperedLB::doLBStages: Found {} migratable objects out of {}.\n",
num_migratable_objs, total_num_objs
);

send_edges_.clear();
recv_edges_.clear();
bool has_comm = false;
Expand Down Expand Up @@ -1326,12 +1343,14 @@ void TemperedLB::doLBStages(LoadType start_imb) {
);
}

auto remote_block_count = getRemoteBlockCountHere();
runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] {
proxy_.allreduce<&TemperedLB::remoteBlockCountHandler, collective::PlusOp>(
remote_block_count
);
});
// Skip this block when not using SwapClusters
if (transfer_type_ == TransferTypeEnum::SwapClusters) {
auto remote_block_count = getRemoteBlockCountHere();
runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] {
proxy_.allreduce<&TemperedLB::remoteBlockCountHandler,
collective::PlusOp>(remote_block_count);
});
}
} else if (this_node == 0) {
vt_debug_print(
terse, temperedlb,
Expand Down Expand Up @@ -2269,7 +2288,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
try_new_mem += src_cluster.cluster_footprint;

if (try_new_mem > mem_thresh_) {
return - std::numeric_limits<double>::infinity();
return - epsilon;
}

BytesType src_new_mem = current_memory_usage_;
Expand All @@ -2289,7 +2308,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
src_new_mem -= src_cluster.cluster_footprint;

if (src_new_mem > mem_thresh_) {
return - std::numeric_limits<double>::infinity();
return - epsilon;
}

double const src_new_work =
Expand Down Expand Up @@ -2596,12 +2615,12 @@ void TemperedLB::swapClusters() {

// Necessary but not sufficient check regarding memory bounds
if (try_mem - try_cluster.bytes + src_cluster.bytes > mem_thresh_) {
return - std::numeric_limits<double>::infinity();
return - epsilon;
}

auto const src_mem = current_memory_usage_;
if (src_mem + try_cluster.bytes - src_cluster.bytes > mem_thresh_) {
return - std::numeric_limits<double>::infinity();
return - epsilon;
}

auto const& try_info = load_info_.find(try_rank)->second;
Expand Down
2 changes: 1 addition & 1 deletion src/vt/vrt/collection/balance/temperedlb/temperedlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ struct TemperedLB : BaseLB {
double beta = 0.0;
double gamma = 0.0;
double delta = 0.0;
double epsilon = 0.0;
double epsilon = std::numeric_limits<double>::infinity();
std::vector<bool> propagated_k_;
std::mt19937 gen_propagate_;
std::mt19937 gen_sample_;
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]}
{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]}
{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]}
{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"collection_id":7,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]}
7 changes: 3 additions & 4 deletions tests/unit/lb/test_temperedlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ std::string writeTemperedLBConfig(std::string transfer_strategy,
" gamma=" << gamma <<
" delta=" << delta;
if (transfer_strategy == "SwapClusters") {
cfg_file_ << " rollback=false";
if (mem_constraints) {
cfg_file_ << " memory_threshold=20.0";
} else {
Expand All @@ -40,6 +39,9 @@ std::string writeTemperedLBConfig(std::string transfer_strategy,
}

void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) {
// Clear the LB config
vrt::collection::balance::ReadLBConfig::clear();

// Set configuration
theConfig()->vt_lb = true;
theConfig()->vt_lb_data_in = true;
Expand All @@ -59,9 +61,6 @@ void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) {

// Assert that temperedLB found the correct imbalance
EXPECT_EQ(phase_info->imb_load_post_lb, expected_imb);

// Clear the LB config ahead of next test
vrt::collection::balance::ReadLBConfig::clear();
}

TEST_F(TestTemperedLB, test_load_only) {
Expand Down

0 comments on commit 23bd9c3

Please sign in to comment.