Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1265 lb data replay without collection #1720

Merged
merged 41 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
206b0ca
#1265: config: add debugging for replay
nlslatt Feb 24, 2022
77c3b7c
#1265: replay: add collectionless replay capability
nlslatt Feb 24, 2022
f5a58cf
#1265: replay: make output more user-friendly
nlslatt Mar 7, 2022
9cb6cd0
#1265: replay: clean up and make more self-documenting
nlslatt Mar 7, 2022
9af4c30
#1265: replay: fix crash when no lb selected
nlslatt Mar 7, 2022
35020fe
#1265: replay: move driver into tools
nlslatt Mar 9, 2022
c820025
#1265: replay: improve debugging
nlslatt Mar 22, 2022
02d09d7
#1265: replay: reduce redundant code
nlslatt Mar 22, 2022
f59499a
#1265: tests: first set of tests for workload replay
nlslatt Mar 7, 2022
6b2b534
#1265: tests: additional tests of workload replay
nlslatt Mar 23, 2022
0405dca
#1265: replay: refactor for better testing
nlslatt Mar 23, 2022
2d50760
#1265: replay: strike all refs to stats except filenames
nlslatt Mar 23, 2022
e1800e3
#1265: replay: updated filenames to not ref stats
nlslatt Mar 23, 2022
10115a3
#1265: tests: reduce redundant code
nlslatt Mar 23, 2022
bfc3a0c
#1265: replay: refactor to improve readability
nlslatt Mar 23, 2022
31cea61
#1265: tests: leverage refactor in testing
nlslatt Mar 23, 2022
35f05da
#1265: replay: clean up code
nlslatt Mar 23, 2022
b693bbe
#1265: tests: add more replay tests
nlslatt Mar 23, 2022
3f07a72
#1265: replay: update license headers
nlslatt Mar 25, 2022
39112db
#1265: replay: add doxygen
nlslatt Mar 25, 2022
9144b7b
#1265: tests: add subphases to workload migrator test
nlslatt Mar 25, 2022
34e77ff
#1265: replay: allow in-memory testing
nlslatt Mar 25, 2022
cb7d762
#1265: tests: run replay without verifying
nlslatt Mar 25, 2022
3839d30
#1265: replay: clarify usage in tool
nlslatt Mar 25, 2022
3ec982e
#1265: replay: add replay namespace
nlslatt Apr 12, 2022
a74cfc9
#1265: replay: clean up code
nlslatt Apr 12, 2022
b24acee
#1265: replay: update to use lb callbacks
nlslatt Apr 13, 2022
d124d63
#1265: replay: update to reflect stats renaming
nlslatt May 2, 2022
dafa15d
#1265: replay: allow custom stats callback for testing
nlslatt May 9, 2022
b9c9263
#1265: tests: verify statistics under replay
nlslatt May 9, 2022
c896627
#1265: replay: fix doxygen
nlslatt May 9, 2022
3af9b7b
#1265: replay: make compatibility updates
nlslatt Nov 16, 2022
d1b2194
#1265: tests: make compatibility updates
nlslatt Oct 17, 2023
3f7c972
#1265: replay: make compatibility updates
nlslatt Oct 17, 2023
d9e0f14
#1265: tests: make compatibility updates
nlslatt Oct 17, 2023
245e27b
#1265: phase manager: use phase passed from lb manager
nlslatt Oct 18, 2023
ae5f091
#1265: lb manager: add accessors needed for statistics computation
nlslatt Oct 18, 2023
638bc0b
#1265: replay: print phase summary when simulating phase
nlslatt Oct 18, 2023
64dd099
#1265: build: make tools build option lowercase
nlslatt Nov 7, 2023
43b7755
#1265: build: remove testing comment from macro
nlslatt Nov 7, 2023
794299d
#1265: replay: remove unnecessary barrier
nlslatt Nov 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ set(PROJECT_BIN_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(PROJECT_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PROJECT_LIB_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib)
set(PROJECT_EXAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/examples)
set(PROJECT_TOOLS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools)

# Import the linking macros for VT-related targets
include(cmake/link_vt.cmake)
Expand Down Expand Up @@ -114,6 +115,25 @@ if (VT_BUILD_TESTS
include(CTest)
endif()

#
nlslatt marked this conversation as resolved.
Show resolved Hide resolved
# Tools
#
# User-facing switch for building the VT tools; declared with a default of ON.
option(vt_build_tools "Build VT tools" ON)

if (vt_build_tools)
message(
STATUS
"VT: building tools"
)

# Create the "tools" target before descending into tools/.
# NOTE(review): presumably tools/CMakeLists.txt attaches its targets to this
# one, which is why the order matters -- confirm against that file.
add_custom_target(tools)
add_subdirectory(tools)
else()
# Tools are switched off: only report it; no target is created and the
# tools/ subdirectory is not processed. Because the option defaults to ON,
# this branch is taken only when vt_build_tools holds a false value.
message(
STATUS "VT: NOT building tools because vt_build_tools is not set."
)
endif()

#
# Examples
#
Expand Down
2 changes: 1 addition & 1 deletion scripts/check_license.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
path_to_vt=${1}
cd "$path_to_vt" || exit 1

for sub_dir in "src" "tests/unit" "tests/perf" "tutorial" "examples"
for sub_dir in "src" "tests/unit" "tests/perf" "tutorial" "examples" "tools"
do
"$path_to_vt/scripts/add-license-perl.pl" "$path_to_vt/$sub_dir" "$path_to_vt/scripts/license-template"
done
Expand Down
2 changes: 2 additions & 0 deletions src/vt/configs/arguments/app_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ struct AppConfig {
bool vt_debug_phase = false;
bool vt_debug_context = false;
bool vt_debug_epoch = false;
bool vt_debug_replay = false;

bool vt_debug_print_flush = false;

Expand Down Expand Up @@ -386,6 +387,7 @@ struct AppConfig {
| vt_debug_phase
| vt_debug_context
| vt_debug_epoch
| vt_debug_replay

| vt_debug_print_flush

Expand Down
3 changes: 3 additions & 0 deletions src/vt/configs/arguments/args.cc
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
auto dcp = "Enable debug_phase = \"" debug_pp(phase) "\"";
auto ddp = "Enable debug_context = \"" debug_pp(context) "\"";
auto dep = "Enable debug_epoch = \"" debug_pp(epoch) "\"";
auto dfp = "Enable debug_replay = \"" debug_pp(replay) "\"";

auto r1 = app.add_option("--vt_debug_level", appConfig.vt_debug_level, rq);

Expand Down Expand Up @@ -410,6 +411,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
auto dc = app.add_flag("--vt_debug_phase", appConfig.vt_debug_phase, dcp);
auto dd = app.add_flag("--vt_debug_context", appConfig.vt_debug_context, ddp);
auto de = app.add_flag("--vt_debug_epoch", appConfig.vt_debug_epoch, dep);
auto df = app.add_flag("--vt_debug_replay", appConfig.vt_debug_replay, dfp);

auto debugGroup = "Debug Print Configuration (must be compile-time enabled)";
r->group(debugGroup);
Expand Down Expand Up @@ -446,6 +448,7 @@ void addDebugPrintArgs(CLI::App& app, AppConfig& appConfig) {
dc->group(debugGroup);
dd->group(debugGroup);
de->group(debugGroup);
df->group(debugGroup);

auto dbq = "Always flush VT runtime prints";
auto eb = app.add_flag("--vt_debug_print_flush", appConfig.vt_debug_print_flush, dbq);
Expand Down
4 changes: 3 additions & 1 deletion src/vt/configs/debug/debug_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ enum CatEnum : uint64_t {
phase = 1ull<<28,
context = 1ull<<29,
epoch = 1ull<<30,
temperedwmin = 1ull<<31
temperedwmin = 1ull<<31,
replay = 1ull<<32
};

enum CtxEnum : uint64_t {
Expand Down Expand Up @@ -138,6 +139,7 @@ vt_option_category_pretty_print(reduce, "reduce")
vt_option_category_pretty_print(rdma, "RDMA")
vt_option_category_pretty_print(rdma_channel, "RDMA Channel")
vt_option_category_pretty_print(rdma_state, "RDMA State")
vt_option_category_pretty_print(replay, "replay")
vt_option_category_pretty_print(runtime, "runtime")
vt_option_category_pretty_print(scatter, "scatter")
vt_option_category_pretty_print(serial_msg, "serialized-msg")
Expand Down
8 changes: 4 additions & 4 deletions src/vt/phase/phase_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info)
phase,
"phase={}, duration={}, rank_max_compute_time={}, rank_avg_compute_time={}, imbalance={:.3f}, "
"grain_max_time={}, migration count={}, lb_name={}\n",
cur_phase_,
last_phase_info->phase,
total_time,
TimeType(last_phase_info->max_load),
TimeType(last_phase_info->avg_load),
Expand All @@ -313,7 +313,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info)
// vt_print(
// phase,
// "POST phase={}, total time={}, max_load={}, avg_load={}, imbalance={:.3f}, migration count={}\n",
// cur_phase_,
// last_phase_info->phase,
// total_time,
// TimeType(last_phase_info->max_load_post_lb),
// TimeType(last_phase_info->avg_load_post_lb),
Expand All @@ -336,7 +336,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info)
auto percent_improvement = compute_percent_improvement(
last_phase_info->max_load, last_phase_info->avg_load
);
if (percent_improvement > 3.0 and cur_phase_ > 0) {
if (percent_improvement > 3.0 and last_phase_info->phase > 0) {
if (grain_percent_improvement < 0.5) {
// grain size is blocking improvement
vt_print(
Expand Down Expand Up @@ -395,7 +395,7 @@ void PhaseManager::printSummary(vrt::collection::lb::PhaseInfo* last_phase_info)
}
}
}
} else if (cur_phase_ == 0) {
} else if (last_phase_info->phase == 0) {
// ran the lb on a phase that may have included initialization costs
vt_print(
phase,
Expand Down
1 change: 1 addition & 0 deletions src/vt/runtime/runtime_banner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,7 @@ void Runtime::printStartupBanner() {
vt_runtime_debug_warn_compile(phase)
vt_runtime_debug_warn_compile(context)
vt_runtime_debug_warn_compile(epoch)
vt_runtime_debug_warn_compile(replay)

auto arg_str = [](std::vector<char*> const& args) -> std::string {
std::stringstream ss;
Expand Down
4 changes: 4 additions & 0 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,10 @@ struct LBManager : runtime::component::Component<LBManager> {

void statsHandler(std::vector<balance::LoadData> const& in_stat_vec);

/**
 * \brief Get the phase info currently held in \c last_phase_info_
 *
 * \return the raw pointer yielded by \c last_phase_info_.get()
 *
 * NOTE(review): presumably a non-owning pointer that may be null if no phase
 * info has been stored yet -- confirm against the member's declaration.
 */
lb::PhaseInfo *getPhaseInfo() { return last_phase_info_.get(); }

/**
 * \brief Set the \c before_lb_stats_ flag
 *
 * \param[in] before_lb value stored into \c before_lb_stats_
 *
 * NOTE(review): the name suggests the flag marks whether the statistics being
 * computed are the pre-LB ones -- confirm where \c before_lb_stats_ is read.
 */
void setComputingBeforeLBStats(bool before_lb) { before_lb_stats_ = before_lb; }

private:
bool isCollectiveComm(elm::CommCategory cat) const;

Expand Down
Loading
Loading