From 1d2721cd898abf6fde89f4fed60a85e40cf592c4 Mon Sep 17 00:00:00 2001 From: Luca Miccini Date: Wed, 23 Oct 2024 17:46:41 +0200 Subject: [PATCH] Add delay parameter We've seen issues in the past with NFS servers that would not release locks fast enough for evacuation to complete (see https://bugzilla.redhat.com/show_bug.cgi?id=1755760 for example). This commit adds a DELAY parameter so we can wait for a configurable number of seconds before starting the evacuation. We also take the chance to fix a log message that hardcoded the retry interval (30 seconds) instead of using the POLL variable. --- templates/instanceha/bin/instanceha.py | 6 +++++- templates/instanceha/config/config.yaml | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/templates/instanceha/bin/instanceha.py b/templates/instanceha/bin/instanceha.py index ddd23010..9995c51a 100755 --- a/templates/instanceha/bin/instanceha.py +++ b/templates/instanceha/bin/instanceha.py @@ -68,6 +68,7 @@ def start_health_check_server(): TAGGED_IMAGES = config["TAGGED_IMAGES"] if 'TAGGED_IMAGES' in config else "true" TAGGED_FLAVORS = config["TAGGED_FLAVORS"] if 'TAGGED_FLAVORS' in config else "true" DELTA = int(config["DELTA"]) if 'DELTA' in config else 30 +DELAY = int(config["DELAY"]) if 'DELAY' in config else 0 POLL = int(config["POLL"]) if 'POLL' in config else 45 THRESHOLD = int(config["THRESHOLD"]) if 'THRESHOLD' in config else 50 WORKERS = int(config["WORKERS"]) if 'WORKERS' in config else 4 @@ -182,6 +183,9 @@ def _host_evacuate(connection, host): logging.info("Nothing to evacuate") return True + # sleep for DELAY, this could be useful if nfs is in use and locks need to be released + time.sleep(DELAY) + # if SMART_EVACUATION is 'True' (string) use a ThreadPoolExecutor to poll the evacuation status # otherwise use the old "fire and forget" approach if 'true' in SMART_EVACUATION.lower(): @@ -865,7 +869,7 @@ def main(): with concurrent.futures.ThreadPoolExecutor() as executor: results = list(executor.map(lambda service: process_service(service, reserved_hosts, True), 
to_resume)) if not all(results): - logging.warning('Some services failed to evacuate. Retrying in 30 seconds.') + logging.warning('Some services failed to evacuate. Retrying in %s seconds.' % POLL) else: logging.info('InstanceHa DISABLE is true, not evacuating') diff --git a/templates/instanceha/config/config.yaml b/templates/instanceha/config/config.yaml index a8c5fecc..7367d23c 100644 --- a/templates/instanceha/config/config.yaml +++ b/templates/instanceha/config/config.yaml @@ -4,6 +4,7 @@ config: TAGGED_FLAVORS: "true" SMART_EVACUATION: "false" DELTA: "30" + DELAY: "0" POLL: "45" THRESHOLD: "50" WORKERS: "4"