From 23bd9c31a1a858d829c6337ff0cbdfa2a64332b2 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Thu, 5 Sep 2024 09:25:54 -0400 Subject: [PATCH] #2201: wip: fix review comments; add collection_id to synthetic data --- .../balance/temperedlb/temperedlb.cc | 57 ++++++++++++------- .../balance/temperedlb/temperedlb.h | 2 +- .../synthetic-dataset-blocks.0.json | 2 +- .../synthetic-dataset-blocks.1.json | 2 +- .../synthetic-dataset-blocks.2.json | 2 +- tests/unit/lb/test_temperedlb.cc | 7 +-- 6 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index a26e8ee362..73385921a6 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -257,6 +257,7 @@ Default: true Description: If the final iteration of a trial has a worse imbalance than any earlier iteration, it will roll back to the iteration with the best imbalance. + If transfer_strategy is SwapClusters, rollback is automatically set to false. )" }, { @@ -291,32 +292,32 @@ Description: α in the work model (load in work model) "beta", R"( Values: -Defaut: 1.0 +Defaut: 0.0 Description: β in the work model (inter-node communication in work model) )" }, { - "epsilon", + "gamma", R"( Values: -Defaut: 1.0 -Description: ε in the work model (memory term in work model) +Defaut: 0.0 +Description: γ in the work model (intra-node communication in work model) )" }, { "delta", R"( Values: -Defaut: 1.0 +Defaut: 0.0 Description: δ in the work model (shared-memory-edges in work model) )" }, { - "gamma", + "epsilon", R"( Values: -Defaut: 1.0 -Description: γ in the work model (intra-node communication in work model) +Defaut: infinity +Description: ε in the work model (memory term in work model) )" } }; @@ -456,6 +457,10 @@ void TemperedLB::inputParams(balance::ConfigEntry* config) { ); transfer_type_ = transfer_type_converter_.getFromConfig(config, transfer_type_); + if (transfer_type_ == TransferTypeEnum::SwapClusters) { + rollback_ = false; + } + balance::LBArgsEnumConverter obj_ordering_converter_( "ordering", "ObjectOrderEnum", { {ObjectOrderEnum::Arbitrary, "Arbitrary"}, @@ -1066,10 +1071,22 @@ void TemperedLB::doLBStages(LoadType start_imb) { if (first_iter) { // Copy this node's object assignments to a local, mutable copy cur_objs_.clear(); + int total_num_objs = 0; + int num_migratable_objs = 0; for (auto obj : *load_model_) { - cur_objs_[obj] = getModeledValue(obj); + total_num_objs++; + if (obj.isMigratable()) { + num_migratable_objs++; + cur_objs_[obj] = getModeledValue(obj); + } } + vt_debug_print( + normal, temperedlb, + "TemperedLB::doLBStages: Found {} migratable objects out of {}.\n", + num_migratable_objs, total_num_objs + ); + send_edges_.clear(); recv_edges_.clear(); bool has_comm = false; @@ -1326,12 +1343,14 @@ void TemperedLB::doLBStages(LoadType start_imb) { ); } - auto remote_block_count = getRemoteBlockCountHere(); - runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] { - proxy_.allreduce<&TemperedLB::remoteBlockCountHandler, collective::PlusOp>( - remote_block_count - ); - }); + // Skip this block when not using SwapClusters + if (transfer_type_ == TransferTypeEnum::SwapClusters) { + auto remote_block_count = getRemoteBlockCountHere(); + runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] { + proxy_.allreduce<&TemperedLB::remoteBlockCountHandler, + collective::PlusOp>(remote_block_count); + }); + } } else if (this_node == 0) { vt_debug_print( terse, temperedlb, @@ -2269,7 +2288,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { try_new_mem += src_cluster.cluster_footprint; if (try_new_mem > mem_thresh_) { - return - std::numeric_limits::infinity(); + return - epsilon; } BytesType src_new_mem = current_memory_usage_; @@ -2289,7 +2308,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { src_new_mem -= src_cluster.cluster_footprint; if (src_new_mem > mem_thresh_) { - return - std::numeric_limits::infinity(); + return - epsilon; } double const src_new_work = @@ -2596,12 +2615,12 @@ void TemperedLB::swapClusters() { // Necessary but not sufficient check regarding memory bounds if (try_mem - try_cluster.bytes + src_cluster.bytes > mem_thresh_) { - return - std::numeric_limits::infinity(); + return - epsilon; } auto const src_mem = current_memory_usage_; if (src_mem + try_cluster.bytes - src_cluster.bytes > mem_thresh_) { - return - std::numeric_limits::infinity(); + return - epsilon; } auto const& try_info = load_info_.find(try_rank)->second; diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index b30cd66499..f28084973f 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -417,7 +417,7 @@ struct TemperedLB : BaseLB { double beta = 0.0; double gamma = 0.0; double delta = 0.0; - double epsilon = 0.0; + double epsilon = std::numeric_limits::infinity(); std::vector propagated_k_; std::mt19937 gen_propagate_; std::mt19937 gen_sample_; diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json index 26afaaea2c..792f750954 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json @@ -1 +1 @@ -{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]} +{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]} diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json index 160cf422d5..99f476a215 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json @@ -1 +1 @@ -{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]} +{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]} diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json index 5b1e88a01d..a09b3dba90 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json @@ -1 +1 @@ -{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]} +{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"collection_id":7,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]} diff --git a/tests/unit/lb/test_temperedlb.cc b/tests/unit/lb/test_temperedlb.cc index 9dbea33901..7496df99a9 100644 --- a/tests/unit/lb/test_temperedlb.cc +++ b/tests/unit/lb/test_temperedlb.cc @@ -27,7 +27,6 @@ std::string writeTemperedLBConfig(std::string transfer_strategy, " gamma=" << gamma << " delta=" << delta; if (transfer_strategy == "SwapClusters") { - cfg_file_ << " rollback=false"; if (mem_constraints) { cfg_file_ << " memory_threshold=20.0"; } else { @@ -40,6 +39,9 @@ std::string writeTemperedLBConfig(std::string transfer_strategy, } void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) { + // Clear the LB config + vrt::collection::balance::ReadLBConfig::clear(); + // Set configuration theConfig()->vt_lb = true; theConfig()->vt_lb_data_in = true; @@ -59,9 +61,6 @@ void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) { // Assert that temperedLB found the correct imbalance EXPECT_EQ(phase_info->imb_load_post_lb, expected_imb); - - // Clear the LB config ahead of next test - vrt::collection::balance::ReadLBConfig::clear(); } TEST_F(TestTemperedLB, test_load_only) {