From 0a3c1d11d7b8c04ddbc5d07510f6a79a3df2c486 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Nov 2024 12:20:45 +0100 Subject: [PATCH 1/8] add cleaning updates --- sisyphus/cleaner.py | 45 +++++++++++++++++++++++++++++++------ sisyphus/global_settings.py | 3 +++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index fc3a858..48c25cc 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -20,6 +20,8 @@ import sys import tempfile +from typing import List, Optional + from sisyphus import graph import sisyphus.global_settings as gs @@ -206,7 +208,7 @@ def search_for_unused(job_dirs, current=gs.WORK_DIR, verbose=True): return unused -def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", force=False): +def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", force=False, filter_affected=None): """list all directories that will be deleted and add a security check""" if isinstance(dirs, str): dirs = load_remove_list(dirs) @@ -228,7 +230,19 @@ def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", fo if input_var.lower() != "n": logging.info("Affected directories:") for i in tmp: - logging.info(i) + if os.path.exists(i + "/info") and gs.CLEANER_PRINT_ALIAS: + with open(i + "/info") as f: + lines = f.readlines() + if lines[-1].strip().startswith("ALIAS"): + s = lines[-1].strip() + s.replace("ALIAS:", "ALIAS AT CREATION:") + else: + s = "" + else: + s = "" + if filter_affected is None or any(x in i for x in filter_affected): + logging.info(i + " " + s) + else: with tempfile.NamedTemporaryFile(mode="w") as tmp_file: for directory in dirs: @@ -280,8 +294,17 @@ def cleanup_jobs(): job._sis_cleanup() -def cleanup_keep_value(min_keep_value, load_from: str = "", mode: str = "remove"): - """Go through all jobs in the current graph to remove all jobs with a lower keep value that the given minimum""" +def cleanup_keep_value( + min_keep_value, load_from: str = "", mode: str = "remove", filter_affected: Optional[List[str]] = None +): + """Go through all jobs in the current graph to remove all jobs with a lower keep value that the given minimum + + :param min_keep_value: Remove jobs with lower keep value than this + :param load_from: File name to load list with used directories + :param mode: Cleanup mode ('remove', 'move', or 'dryrun') + :param filter_affected: Defines what substrings should be printed when listing affected directories + + """ if min_keep_value <= 0: logging.error("Keep value must be larger than 0") if load_from: @@ -291,17 +314,25 @@ def cleanup_keep_value(min_keep_value, load_from: str = "", mode: str = "remove" to_remove = find_too_low_keep_value(job_dirs, min_keep_value) remove_directories( - to_remove, "Remove jobs with lower keep value than min", move_postfix=".cleanup", mode=mode, force=False + to_remove, + "Remove jobs with lower keep value than min", + move_postfix=".cleanup", + mode=mode, + force=False, + filter_affected=filter_affected, ) -def cleanup_unused(load_from: str = "", job_dirs=None, mode="remove"): +def cleanup_unused( + load_from: str = "", job_dirs: List = None, mode: str = "remove", filter_affected: Optional[List[str]] = None +): """Check work directory and remove all subdirectories which do not belong to the given list of directories. If no input is given it removes everything that is not in the current graph :param load_from: File name to load list with used directories :param job_dirs: Already loaded list of used directories :param mode: Cleanup mode ('remove', 'move', or 'dryrun') + :param filter_affected: Defines what substrings should be printed when listing affected directories :return: """ if job_dirs: @@ -311,4 +342,4 @@ def cleanup_unused(load_from: str = "", job_dirs=None, mode="remove"): else: job_dirs = list_all_graph_directories() to_remove = search_for_unused(job_dirs, verbose=True) - remove_directories(to_remove, "Not used in graph", mode=mode, force=False) + remove_directories(to_remove, "Not used in graph", mode=mode, force=False, filter_affected=filter_affected) diff --git a/sisyphus/global_settings.py b/sisyphus/global_settings.py index f0fbda9..5f509b0 100644 --- a/sisyphus/global_settings.py +++ b/sisyphus/global_settings.py @@ -161,6 +161,9 @@ def file_caching(path): JOB_CLEANUP_KEEP_INPUT = True #: Default value for job used by tk.cleaner to determine if a job should be removed or not JOB_DEFAULT_KEEP_VALUE = 50 +#: +CLEANER_PRINT_ALIAS = True + #: How many threads should update the graph in parallel, useful if the filesystem has a high latency GRAPH_WORKER = 16 From 9f5db2e4d989b43a372732b4518447990d975432 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Nov 2024 12:28:30 +0100 Subject: [PATCH 2/8] update typing --- sisyphus/cleaner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index 48c25cc..7a5ab97 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -23,6 +23,7 @@ from typing import List, Optional from sisyphus import graph +from sisyphus.job import Job import sisyphus.global_settings as gs @@ -324,7 +325,7 @@ def cleanup_keep_value( def cleanup_unused( - load_from: str = "", job_dirs: List = None, mode: str = "remove", filter_affected: Optional[List[str]] = None + load_from: str = "", job_dirs: List[Job] = None, mode: str = "remove", filter_affected: Optional[List[str]] = None ): """Check work directory and remove all subdirectories which do not belong to the given list of directories. If no input is given it removes everything that is not in the current graph From 9024dd569896f95e9644a09c6128bb2abc530e41 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Thu, 14 Nov 2024 12:37:56 +0100 Subject: [PATCH 3/8] Update sisyphus/cleaner.py Co-authored-by: michelwi --- sisyphus/cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index 7a5ab97..5ab32d5 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -296,7 +296,7 @@ def cleanup_jobs(): def cleanup_keep_value( - min_keep_value, load_from: str = "", mode: str = "remove", filter_affected: Optional[List[str]] = None + min_keep_value: int, load_from: str = "", mode: str = "remove", filter_affected: Optional[List[str]] = None ): """Go through all jobs in the current graph to remove all jobs with a lower keep value that the given minimum From 84d71b47e0c66127fed2406de02c0780e8d525ef Mon Sep 17 00:00:00 2001 From: Benedikt Date: Fri, 15 Nov 2024 09:56:34 +0100 Subject: [PATCH 4/8] add affected --- sisyphus/cleaner.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index 5ab32d5..b69ad25 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -19,6 +19,7 @@ import shutil import sys import tempfile +import time from typing import List, Optional @@ -209,7 +210,7 @@ def search_for_unused(job_dirs, current=gs.WORK_DIR, verbose=True): return unused -def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", force=False, filter_affected=None): +def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", force=False, filter_affected=None, filter_printed=None): """list all directories that will be deleted and add a security check""" if isinstance(dirs, str): dirs = load_remove_list(dirs) @@ -241,13 +242,15 @@ def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", fo s = "" else: s = "" - if filter_affected is None or any(x in i for x in filter_affected): + if filter_printed is None or any(x in i for x in filter_printed): logging.info(i + " " + s) else: with tempfile.NamedTemporaryFile(mode="w") as tmp_file: for directory in dirs: - tmp_file.write(directory + "\x00") + if filter_affected is None or any(x in directory for x in filter_affected): + tmp_file.write(directory + "\x00") + time.sleep(1000) tmp_file.flush() command = "du -sch --files0-from=%s" % tmp_file.name p = os.popen(command) @@ -325,7 +328,7 @@ def cleanup_keep_value( def cleanup_unused( - load_from: str = "", job_dirs: List[Job] = None, mode: str = "remove", filter_affected: Optional[List[str]] = None + load_from: str = "", job_dirs: List[Job] = None, mode: str = "remove", filter_affected: Optional[List[str]] = None, filter_printed: Optional[List[str]] = None, ): """Check work directory and remove all subdirectories which do not belong to the given list of directories. If no input is given it removes everything that is not in the current graph @@ -333,7 +336,8 @@ def cleanup_unused( :param load_from: File name to load list with used directories :param job_dirs: Already loaded list of used directories :param mode: Cleanup mode ('remove', 'move', or 'dryrun') - :param filter_affected: Defines what substrings should be printed when listing affected directories + :param filter_affected: Only Jobs matching the substring will be deleted + :param filter_printed: Defines what substrings should be printed when listing affected directories :return: """ if job_dirs: @@ -343,4 +347,4 @@ def cleanup_unused( else: job_dirs = list_all_graph_directories() to_remove = search_for_unused(job_dirs, verbose=True) - remove_directories(to_remove, "Not used in graph", mode=mode, force=False, filter_affected=filter_affected) + remove_directories(to_remove, "Not used in graph", mode=mode, force=False, filter_affected=filter_affected, filter_printed=filter_printed) From 29a6e3bc6008591f150e74079556a5cbb95e0136 Mon Sep 17 00:00:00 2001 From: Benedikt Date: Fri, 15 Nov 2024 09:57:50 +0100 Subject: [PATCH 5/8] update --- sisyphus/cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index b69ad25..1d51b6b 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -242,7 +242,7 @@ def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", fo s = "" else: s = "" - if filter_printed is None or any(x in i for x in filter_printed): + if filter_printed is None or any(x in i for x in filter_printed) and (filter_affected is None or any(x in i for x in filter_affected)): logging.info(i + " " + s) else: From f760f0ba2fd36eb0134ddb274aff7131b0012773 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Fri, 15 Nov 2024 11:51:39 +0100 Subject: [PATCH 6/8] add more features --- sisyphus/cleaner.py | 76 +++++++++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index 1d51b6b..6cd0323 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -21,10 +21,11 @@ import tempfile import time -from typing import List, Optional +from typing import Dict, List, Optional, Set, Union from sisyphus import graph from sisyphus.job import Job +from sisyphus.job_path import Path import sisyphus.global_settings as gs @@ -84,11 +85,17 @@ def extract_keep_values_from_graph(): return job_dirs -def find_too_low_keep_value(job_dirs, min_keep_value): +def find_too_low_keep_value( + job_dirs: Union[str, Dict[Union[str, Path], int]], + min_keep_value: int, + filter_removed_jobs: Optional[List[Union[str, Path]]] = None, +): """Check all given job if they can be removed and have a keep value lower min_keep_value. :param job_dirs: dict with all keep values, can be created with extract_keep_values_from_graph :param min_keep_value: minimal keep value + :param filter_removed_jobs: Only Jobs matching the substring will be deleted + :return: """ if isinstance(job_dirs, str): @@ -100,7 +107,7 @@ def find_too_low_keep_value(job_dirs, min_keep_value): continue if keep_value == JOB_WITHOUT_KEEP_VALUE: keep_value = gs.JOB_DEFAULT_KEEP_VALUE - if keep_value < min_keep_value: + if keep_value < min_keep_value and (filter_removed_jobs is None or any(x in path for x in filter_removed_jobs)): to_remove.add(path) return to_remove @@ -121,7 +128,7 @@ def list_all_graph_directories(): return job_dirs -def save_used_paths(outfile=None, job_dirs=None): +def save_used_paths(outfile: Union[str, Path] = None, job_dirs: Dict[Union[str, Path], int] = None): """Write dict of directories in the graph to file :param outfile: Filename of output file, if not given write to stdout @@ -137,7 +144,7 @@ def save_used_paths(outfile=None, job_dirs=None): out.close() -def load_used_paths(infile): +def load_used_paths(infile: Union[str, Path]): """Load list save with save_used_paths :param infile: Filename to load from @@ -151,7 +158,7 @@ def load_used_paths(infile): return job_dirs -def save_remove_list(to_remove, outfile): +def save_remove_list(to_remove: List[Union[str, Path]], outfile: Union[str, Path]): """Write list of files that should be removed to file :param to_remove: List of directories :param outfile: Filename of output file @@ -162,7 +169,7 @@ def save_remove_list(to_remove, outfile): f.write(i + "\n") -def load_remove_list(infile): +def load_remove_list(infile: Union[str, Path]): """Load list save with save_remove_list :param infile: Filename to load from @@ -175,12 +182,19 @@ def load_remove_list(infile): return out -def search_for_unused(job_dirs, current=gs.WORK_DIR, verbose=True): +def search_for_unused( + job_dirs: Union[str, Dict[Union[str, Path], int]], + current: str = gs.WORK_DIR, + verbose: bool = True, + filter_unused: Optional[List[str]] = None, +): """Check work directory and list all subdirectories which do not belong to the given list of directories. :param job_dirs: dict with all used directories, can be created with list_all_graph_directories. :param current: current work directory :param verbose: make it verbose + :param filter_unused: Only Jobs matching the substring will be deleted + :return: List with all unused directories """ @@ -196,11 +210,11 @@ def search_for_unused(job_dirs, current=gs.WORK_DIR, verbose=True): path = os.path.join(current, short_path) status = job_dirs.get(path) - if status is None: + if status is None and (filter_unused is None or any(x in path for x in filter_unused)): unused.add(path) elif status == DIR_IN_GRAPH: # directory has sub directories used by current graph - found = search_for_unused(job_dirs, path, verbose) + found = search_for_unused(job_dirs, path, verbose, filter_unused=filter_unused) unused.update(found) if verbose: logging.info("found %s unused directories in %s (total so far: %s)" % (len(found), path, len(unused))) @@ -210,7 +224,14 @@ def search_for_unused(job_dirs, current=gs.WORK_DIR, verbose=True): return unused -def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", force=False, filter_affected=None, filter_printed=None): +def remove_directories( + dirs: Union[str, Dict[Union[str, Path], int], Set], + message: str, + move_postfix: str = ".cleanup", + mode: str = "remove", + force: bool = False, + filter_printed: Optional[List[str]] = None, +): """list all directories that will be deleted and add a security check""" if isinstance(dirs, str): dirs = load_remove_list(dirs) @@ -242,15 +263,13 @@ def remove_directories(dirs, message, move_postfix=".cleanup", mode="remove", fo s = "" else: s = "" - if filter_printed is None or any(x in i for x in filter_printed) and (filter_affected is None or any(x in i for x in filter_affected)): + if filter_printed is None or any(x in i for x in filter_printed): logging.info(i + " " + s) else: with tempfile.NamedTemporaryFile(mode="w") as tmp_file: for directory in dirs: - if filter_affected is None or any(x in directory for x in filter_affected): - tmp_file.write(directory + "\x00") - time.sleep(1000) + tmp_file.write(directory + "\x00") tmp_file.flush() command = "du -sch --files0-from=%s" % tmp_file.name p = os.popen(command) @@ -299,14 +318,19 @@ def cleanup_jobs(): def cleanup_keep_value( - min_keep_value: int, load_from: str = "", mode: str = "remove", filter_affected: Optional[List[str]] = None + min_keep_value: int, + load_from: str = "", + mode: str = "remove", + filter_removed_jobs: Optional[List[str]] = None, + filter_printed: Optional[List[str]] = None, ): """Go through all jobs in the current graph to remove all jobs with a lower keep value that the given minimum :param min_keep_value: Remove jobs with lower keep value than this :param load_from: File name to load list with used directories :param mode: Cleanup mode ('remove', 'move', or 'dryrun') - :param filter_affected: Defines what substrings should be printed when listing affected directories + :param filter_removed_jobs: Only Jobs matching the substring will be deleted + :param filter_printed: Defines what substrings should be printed when listing affected directories """ if min_keep_value <= 0: @@ -316,19 +340,23 @@ def cleanup_keep_value( else: job_dirs = extract_keep_values_from_graph() - to_remove = find_too_low_keep_value(job_dirs, min_keep_value) + to_remove = find_too_low_keep_value(job_dirs, min_keep_value, filter_removed_jobs=filter_removed_jobs) remove_directories( to_remove, "Remove jobs with lower keep value than min", move_postfix=".cleanup", mode=mode, force=False, - filter_affected=filter_affected, + filter_printed=filter_printed, ) def cleanup_unused( - load_from: str = "", job_dirs: List[Job] = None, mode: str = "remove", filter_affected: Optional[List[str]] = None, filter_printed: Optional[List[str]] = None, + load_from: str = "", + job_dirs: List[Job] = None, + mode: str = "remove", + filter_unused: Optional[List[str]] = None, + filter_printed: Optional[List[str]] = None, ): """Check work directory and remove all subdirectories which do not belong to the given list of directories. If no input is given it removes everything that is not in the current graph @@ -336,7 +364,7 @@ def cleanup_unused( :param load_from: File name to load list with used directories :param job_dirs: Already loaded list of used directories :param mode: Cleanup mode ('remove', 'move', or 'dryrun') - :param filter_affected: Only Jobs matching the substring will be deleted + :param filter_unused: Only Jobs matching the substring will be deleted :param filter_printed: Defines what substrings should be printed when listing affected directories :return: """ @@ -346,5 +374,7 @@ def cleanup_unused( job_dirs = load_used_paths(load_from) else: job_dirs = list_all_graph_directories() - to_remove = search_for_unused(job_dirs, verbose=True) - remove_directories(to_remove, "Not used in graph", mode=mode, force=False, filter_affected=filter_affected, filter_printed=filter_printed) + to_remove = search_for_unused(job_dirs, verbose=True, filter_unused=filter_unused) + remove_directories( + to_remove, "Not used in graph", mode=mode, force=False, filter_printed=filter_printed, + ) From a2d9eeb755d19e65b82ca4aa8861611e10b7bba8 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Fri, 15 Nov 2024 12:11:40 +0100 Subject: [PATCH 7/8] black --- sisyphus/cleaner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index 6cd0323..b26b74b 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -376,5 +376,9 @@ def cleanup_unused( job_dirs = list_all_graph_directories() to_remove = search_for_unused(job_dirs, verbose=True, filter_unused=filter_unused) remove_directories( - to_remove, "Not used in graph", mode=mode, force=False, filter_printed=filter_printed, + to_remove, + "Not used in graph", + mode=mode, + force=False, + filter_printed=filter_printed, ) From 4b5a66c470c3a36d9b9a7831545dcb489cebd105 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Sat, 16 Nov 2024 10:59:00 +0100 Subject: [PATCH 8/8] add return types --- sisyphus/cleaner.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sisyphus/cleaner.py b/sisyphus/cleaner.py index b26b74b..36c573b 100644 --- a/sisyphus/cleaner.py +++ b/sisyphus/cleaner.py @@ -19,7 +19,6 @@ import shutil import sys import tempfile -import time from typing import Dict, List, Optional, Set, Union @@ -36,7 +35,7 @@ JOB_WITHOUT_KEEP_VALUE = -1 -def extract_keep_values_from_graph(): +def extract_keep_values_from_graph() -> Dict[str, int]: """Go through loaded graph and create dict with all jobs and keep values :return: @@ -89,7 +88,7 @@ def find_too_low_keep_value( job_dirs: Union[str, Dict[Union[str, Path], int]], min_keep_value: int, filter_removed_jobs: Optional[List[Union[str, Path]]] = None, -): +) -> Set[Union[Path, str]]: """Check all given job if they can be removed and have a keep value lower min_keep_value. :param job_dirs: dict with all keep values, can be created with extract_keep_values_from_graph @@ -112,7 +111,7 @@ def find_too_low_keep_value( return to_remove -def list_all_graph_directories(): +def list_all_graph_directories() -> Dict[str, int]: """Create dict containing all filesystem directories used by jobs inside the loaded graph :return: dict @@ -144,7 +143,7 @@ def save_used_paths(outfile: Union[str, Path] = None, job_dirs: Dict[Union[str, out.close() -def load_used_paths(infile: Union[str, Path]): +def load_used_paths(infile: Union[str, Path]) -> Dict[str, int]: """Load list save with save_used_paths :param infile: Filename to load from @@ -169,7 +168,7 @@ def save_remove_list(to_remove: List[Union[str, Path]], outfile: Union[str, Path f.write(i + "\n") -def load_remove_list(infile: Union[str, Path]): +def load_remove_list(infile: Union[str, Path]) -> List[str]: """Load list save with save_remove_list :param infile: Filename to load from @@ -187,7 +186,7 @@ def search_for_unused( current: str = gs.WORK_DIR, verbose: bool = True, filter_unused: Optional[List[str]] = None, -): +) -> Set[str]: """Check work directory and list all subdirectories which do not belong to the given list of directories. :param job_dirs: dict with all used directories, can be created with list_all_graph_directories.