From b158af9ba2a36ea5b9beca8ae669401665ec1f78 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Fri, 20 Dec 2024 13:55:54 -0500 Subject: [PATCH] #1: output: only write a node if no slow ranks were found on it --- detection/detect_slow_nodes.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/detection/detect_slow_nodes.py b/detection/detect_slow_nodes.py index 7c7edcd..3fe13d1 100644 --- a/detection/detect_slow_nodes.py +++ b/detection/detect_slow_nodes.py @@ -18,6 +18,7 @@ def __init__(self, output_filepath, threshold_pct=0.05): # Initialize outliers self.__outlying_nodes = {} + self.__outlying_node_names = [] self.__outlying_iterations = {} # Initialize directories @@ -133,6 +134,9 @@ def __analyze_across_nodes(self): if time in outliers: self.__outlying_nodes[n_id] = time + for node in self.__outlying_nodes.keys(): + self.__outlying_node_names.append(self.__node_to_proc_map[node]) + def __analyze_within_nodes(self): """ Compares the execution of each iteration on a single node to @@ -221,14 +225,17 @@ def create_hostfile(self): Outputs a hostfile that contains a list of all nodes, omitting any slow nodes. """ - good_procs = set([self.__node_to_proc_map[node] for node in self.__node_times.keys() if node not in self.__outlying_nodes.keys()]) + good_node_names = set([ + node_name for node_name in self.__node_to_proc_map.values() + if node_name not in self.__outlying_node_names + ]) hostfile_path = os.path.join(self.__output_dir, "hostfile.txt") with open(hostfile_path, "w") as hostfile: - for proc in good_procs: - hostfile.write(proc + "\n") + for node_name in good_node_names: + hostfile.write(node_name + "\n") - print(f"hostfile with {len(good_procs)} processors has been written to {hostfile_path}") + print(f"hostfile with {len(good_node_names)} processors has been written to {hostfile_path}") def main(): parser = argparse.ArgumentParser(description='Slow Node Detector script.')