From 45518fd3018deb11cf7882ce09907a02ba3b2cc8 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Wed, 11 Sep 2024 14:00:14 -0400 Subject: [PATCH] #2201: loosen strict inequalities for criterion; remove epsilon from computeWork --- .../collection/balance/temperedlb/criterion.h | 2 +- .../balance/temperedlb/temperedlb.cc | 18 +- .../synthetic-dataset-blocks.0.json | 160 +++++++++++++++++- .../synthetic-dataset-blocks.1.json | 141 ++++++++++++++- .../synthetic-dataset-blocks.2.json | 52 +++++- tests/unit/lb/test_temperedlb.cc | 13 +- 6 files changed, 370 insertions(+), 16 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/criterion.h b/src/vt/vrt/collection/balance/temperedlb/criterion.h index 42e8b7befe..dfbc79f380 100644 --- a/src/vt/vrt/collection/balance/temperedlb/criterion.h +++ b/src/vt/vrt/collection/balance/temperedlb/criterion.h @@ -63,7 +63,7 @@ struct GrapevineCriterion { struct ModifiedGrapevineCriterion { bool operator()(LoadType over, LoadType under, LoadType obj, LoadType) const { - return obj < over - under; + return obj <= over - under; } }; diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 73385921a6..e809ec3204 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -193,7 +193,7 @@ Default: Original { "ordering", R"( -Values: {Arbitrary, ElmID, FewestMigrations, SmallObject, LargestObjects} +Values: {Arbitrary, ElmID, FewestMigrations, SmallObjects, LargestObjects} Default: FewestMigrations Description: The order in which local objects are considered for transfer. Options are: @@ -860,8 +860,8 @@ double TemperedLB::computeWork( alpha * load + beta * inter_comm_bytes + gamma * intra_comm_bytes + - delta * shared_comm_bytes + - epsilon; + delta * shared_comm_bytes; + // epsilon; } WorkBreakdown TemperedLB::computeWorkBreakdown( @@ -1894,7 +1894,7 @@ std::vector TemperedLB::orderObjects( auto single_obj_load = this_new_load; for (auto &obj : cur_objs) { auto obj_load = obj.second; - if (obj_load > over_avg && obj_load < single_obj_load) { + if (obj_load >= over_avg && obj_load < single_obj_load) { single_obj_load = obj_load; } } @@ -2340,7 +2340,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { try_rank, try_info, try_total_bytes, try_max_owm, try_max_osm, src_cluster, empty_cluster ); - if (c_try > 0.0) { + if (c_try >= 0.0) { if (c_try > best_c_try) { best_c_try = c_try; best_swap = std::make_tuple(src_shared_id, no_shared_id); @@ -2358,7 +2358,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { "testing a possible swap (rank {}): {} {} c_try={}\n", try_rank, src_shared_id, try_shared_id, c_try ); - if (c_try > 0.0) { + if (c_try >= 0.0) { if (c_try > best_c_try) { best_c_try = c_try; best_swap = std::make_tuple(src_shared_id, try_shared_id); @@ -2367,7 +2367,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr msg) { } } - if (best_c_try > 0) { + if (best_c_try >= 0) { // FIXME C++20: use structured binding auto const src_shared_id = std::get<0>(best_swap); auto const try_shared_id = std::get<1>(best_swap); @@ -2653,7 +2653,7 @@ void TemperedLB::swapClusters() { { ClusterInfo empty_cluster; double c_try = criterion(try_rank, try_mem, src_cluster, empty_cluster); - if (c_try > 0.0) { + if (c_try >= 0.0) { // Try to obtain lock for feasible swap found_potential_good_swap = true; proxy_[try_rank].template send<&TemperedLB::tryLock>(this_node, c_try); @@ -2665,7 +2665,7 @@ void TemperedLB::swapClusters() { for (auto const& [try_shared_id, try_cluster] : try_clusters) { // Decide whether swap is beneficial double c_try = criterion(try_rank, try_mem, src_cluster, try_cluster); - if (c_try > 0.0) { + if (c_try >= 0.0) { // Try to obtain lock for feasible swap found_potential_good_swap = true; proxy_[try_rank].template send<&TemperedLB::tryLock>(this_node, c_try); diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json index 792f750954..83cd135ae1 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.0.json @@ -1 +1,159 @@ -{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]} +{ + "metadata": { + "rank": 0, + "type": "LBDatafile" + }, + "phases": [ + { + "communications": [ + { + "bytes": 2.0, + "from": { + "collection_id": 7, + "home": 0, + "seq_id": 0, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 1, + "seq_id": 5, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + }, + { + "bytes": 1.0, + "from": { + "collection_id": 7, + "home": 0, + "seq_id": 1, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 1, + "seq_id": 4, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + }, + { + "bytes": 1.0, + "from": { + "collection_id": 7, + "home": 0, + "seq_id": 3, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 0, + "seq_id": 2, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + }, + { + "bytes": 0.5, + "from": { + "collection_id": 7, + "home": 0, + "seq_id": 3, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 2, + "seq_id": 8, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + } + ], + "id": 0, + "tasks": [ + { + "entity": { + "collection_id": 7, + "home": 0, + "seq_id": 1, + "migratable": true, + "type": "object" + }, + "node": 0, + "resource": "cpu", + "time": 0.5, + "user_defined": { + "home_rank": 0, + "shared_bytes": 9.0, + "shared_id": 0 + } + }, + { + "entity": { + "collection_id": 7, + "home": 0, + "seq_id": 3, + "migratable": true, + "type": "object" + }, + "node": 0, + "resource": "cpu", + "time": 0.5, + "user_defined": { + "home_rank": 0, + "shared_bytes": 9.0, + "shared_id": 1 + } + }, + { + "entity": { + "collection_id": 7, + "home": 0, + "seq_id": 2, + "migratable": true, + "type": "object" + }, + "node": 0, + "resource": "cpu", + "time": 0.5, + "user_defined": { + "home_rank": 0, + "shared_bytes": 9.0, + "shared_id": 1 + } + }, + { + "entity": { + "collection_id": 7, + "home": 0, + "seq_id": 0, + "migratable": true, + "type": "object" + }, + "node": 0, + "resource": "cpu", + "time": 1.0, + "user_defined": { + "home_rank": 0, + "shared_bytes": 9.0, + "shared_id": 0 + } + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json index 99f476a215..050fd1b1a5 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.1.json @@ -1 +1,140 @@ -{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]} +{ + "metadata": { + "rank": 1, + "type": "LBDatafile" + }, + "phases": [ + { + "communications": [ + { + "bytes": 2.0, + "from": { + "collection_id": 7, + "home": 1, + "seq_id": 4, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 0, + "seq_id": 1, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + }, + { + "bytes": 2.0, + "from": { + "collection_id": 7, + "home": 1, + "seq_id": 5, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 2, + "seq_id": 8, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + }, + { + "bytes": 1.0, + "from": { + "collection_id": 7, + "home": 1, + "seq_id": 7, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 1, + "seq_id": 6, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + } + ], + "id": 0, + "tasks": [ + { + "entity": { + "collection_id": 7, + "home": 1, + "seq_id": 5, + "migratable": true, + "type": "object" + }, + "node": 1, + "resource": "cpu", + "time": 2.0, + "user_defined": { + "home_rank": 1, + "shared_bytes": 9.0, + "shared_id": 2 + } + }, + { + "entity": { + "collection_id": 7, + "home": 1, + "seq_id": 4, + "migratable": true, + "type": "object" + }, + "node": 1, + "resource": "cpu", + "time": 0.5, + "user_defined": { + "home_rank": 1, + "shared_bytes": 9.0, + "shared_id": 2 + } + }, + { + "entity": { + "collection_id": 7, + "home": 1, + "seq_id": 7, + "migratable": true, + "type": "object" + }, + "node": 1, + "resource": "cpu", + "time": 0.5, + "user_defined": { + "home_rank": 1, + "shared_bytes": 9.0, + "shared_id": 3 + } + }, + { + "entity": { + "collection_id": 7, + "home": 1, + "seq_id": 6, + "migratable": true, + "type": "object" + }, + "node": 1, + "resource": "cpu", + "time": 1.0, + "user_defined": { + "home_rank": 1, + "shared_bytes": 9.0, + "shared_id": 3 + } + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json b/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json index a09b3dba90..39dfa10e8c 100644 --- a/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json +++ b/tests/data/synthetic-blocks/synthetic-dataset-blocks.2.json @@ -1 +1,51 @@ -{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"collection_id":7,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]} +{ + "metadata": { + "rank": 2, + "type": "LBDatafile" + }, + "phases": [ + { + "communications": [ + { + "bytes": 1.5, + "from": { + "collection_id": 7, + "home": 2, + "seq_id": 8, + "migratable": true, + "type": "object" + }, + "messages": 1, + "to": { + "collection_id": 7, + "home": 1, + "seq_id": 6, + "migratable": true, + "type": "object" + }, + "type": "SendRecv" + } + ], + "id": 0, + "tasks": [ + { + "entity": { + "collection_id": 7, + "home": 2, + "seq_id": 8, + "migratable": true, + "type": "object" + }, + "node": 2, + "resource": "cpu", + "time": 1.5, + "user_defined": { + "home_rank": 2, + "shared_bytes": 9.0, + "shared_id": 4 + } + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/unit/lb/test_temperedlb.cc b/tests/unit/lb/test_temperedlb.cc index 7496df99a9..f8427e0b5a 100644 --- a/tests/unit/lb/test_temperedlb.cc +++ b/tests/unit/lb/test_temperedlb.cc @@ -63,16 +63,23 @@ void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) { EXPECT_EQ(phase_info->imb_load_post_lb, expected_imb); } -TEST_F(TestTemperedLB, test_load_only) { +TEST_F(TestTemperedLB, test_load_only_original_transfer) { SET_NUM_NODES_CONSTRAINT(4); - auto cfg = writeTemperedLBConfig("SwapClusters", false); + auto cfg = writeTemperedLBConfig("Original", false); runTemperedLBTest(cfg); } +TEST_F(TestTemperedLB, test_load_only_swapclusters) { + SET_NUM_NODES_CONSTRAINT(4); + auto cfg = writeTemperedLBConfig("SwapClusters", false); + // Expect 0.25 in this case because vt does not subcluster + runTemperedLBTest(cfg, 0.25); +} + TEST_F(TestTemperedLB, test_load_and_memory_swapclusters) { SET_NUM_NODES_CONSTRAINT(4); auto cfg = writeTemperedLBConfig("SwapClusters", true); - runTemperedLBTest(cfg); + runTemperedLBTest(cfg, 0.25); } TEST_F(TestTemperedLB, test_load_no_memory_delta_10) {