diff --git a/src/ert/scheduler/driver.py b/src/ert/scheduler/driver.py index f14bc25fe1e..7740607f43c 100644 --- a/src/ert/scheduler/driver.py +++ b/src/ert/scheduler/driver.py @@ -94,7 +94,7 @@ async def _execute_with_retry( _logger = driverlogger or logging.getLogger(__name__) error_message: Optional[str] = None - for _ in range(total_attempts): + for i in range(total_attempts): try: process = await asyncio.create_subprocess_exec( *cmd_with_args, @@ -150,8 +150,8 @@ async def _execute_with_retry( ) _logger.error(error_message) return False, error_message - - await asyncio.sleep(retry_interval) + if i < (total_attempts - 1): + await asyncio.sleep(retry_interval) error_message = ( f'Command "{shlex.join(cmd_with_args)}" failed after {total_attempts} attempts ' f"with {outputs}" diff --git a/src/ert/scheduler/lsf_driver.py b/src/ert/scheduler/lsf_driver.py index df761d128a0..1bd04bee2b9 100644 --- a/src/ert/scheduler/lsf_driver.py +++ b/src/ert/scheduler/lsf_driver.py @@ -282,7 +282,7 @@ def __init__( self._max_attempt: int = 100 self._sleep_time_between_bkills = 30 self._sleep_time_between_cmd_retries = 3 - self._bsub_retries = 10 + self._max_bsub_attempts = 10 self._poll_period = _POLL_PERIOD @@ -363,7 +363,7 @@ async def submit( bsub_with_args, retry_on_empty_stdout=True, retry_codes=(FLAKY_SSH_RETURNCODE,), - total_attempts=self._bsub_retries, + total_attempts=self._max_bsub_attempts, retry_interval=self._sleep_time_between_cmd_retries, error_on_msgs=BSUB_FAILURE_MESSAGES, ) diff --git a/src/ert/scheduler/openpbs_driver.py b/src/ert/scheduler/openpbs_driver.py index cdecc94b696..dbf75ae3199 100644 --- a/src/ert/scheduler/openpbs_driver.py +++ b/src/ert/scheduler/openpbs_driver.py @@ -156,7 +156,7 @@ def __init__( self._num_cpus_per_node: Optional[int] = num_cpus_per_node self._cluster_label: Optional[str] = cluster_label self._job_prefix = job_prefix - self._num_pbs_cmd_retries = 10 + self._max_pbs_cmd_attempts = 10 self._sleep_time_between_cmd_retries = 2 self._poll_period = _POLL_PERIOD @@ -268,7 +268,7 @@ async def submit( QSUB_CONNECTION_REFUSED, ), stdin=script.encode(encoding="utf-8"), - total_attempts=self._num_pbs_cmd_retries, + total_attempts=self._max_pbs_cmd_attempts, retry_interval=self._sleep_time_between_cmd_retries, driverlogger=logger, ) @@ -298,7 +298,7 @@ async def kill(self, iens: int) -> None: [str(self._qdel_cmd), str(job_id)], retry_codes=(QDEL_REQUEST_INVALID,), accept_codes=(QDEL_JOB_HAS_FINISHED,), - total_attempts=self._num_pbs_cmd_retries, + total_attempts=self._max_pbs_cmd_attempts, retry_interval=self._sleep_time_between_cmd_retries, driverlogger=logger, ) diff --git a/src/ert/scheduler/slurm_driver.py b/src/ert/scheduler/slurm_driver.py index bd126c14715..2637a5cc919 100644 --- a/src/ert/scheduler/slurm_driver.py +++ b/src/ert/scheduler/slurm_driver.py @@ -112,7 +112,7 @@ def __init__( self._include_hosts = include_hosts self._sbatch = sbatch_cmd - self._sbatch_retries = 1 + self._max_sbatch_attempts = 1 self._scancel = scancel_cmd self._squeue = squeue_cmd @@ -217,7 +217,7 @@ async def submit( sbatch_with_args, retry_on_empty_stdout=True, retry_codes=(), - total_attempts=self._sbatch_retries, + total_attempts=self._max_sbatch_attempts, retry_interval=self._sleep_time_between_cmd_retries, ) if not process_success: diff --git a/tests/ert/unit_tests/scheduler/test_lsf_driver.py b/tests/ert/unit_tests/scheduler/test_lsf_driver.py index 6d98b722f33..ff6b9a878a5 100644 --- a/tests/ert/unit_tests/scheduler/test_lsf_driver.py +++ b/tests/ert/unit_tests/scheduler/test_lsf_driver.py @@ -282,6 +282,7 @@ async def test_faulty_bsub(monkeypatch, tmp_path, bsub_script, expectation): bsub_path.write_text(f"#!/bin/sh\n{bsub_script}") bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC) driver = LsfDriver() + driver._max_bsub_attempts = 1 with expectation: await driver.submit(0, "sleep") @@ -578,7 +579,7 @@ async def test_that_bsub_will_retry_and_fail( bsub_path.write_text(f"#!/bin/sh\necho {error_msg} >&2\nexit {exit_code}") bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC) driver = LsfDriver() - driver._bsub_retries = 2 + driver._max_bsub_attempts = 2 driver._sleep_time_between_cmd_retries = 0.2 match_str = ( f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg if error_msg else ""}"' @@ -662,7 +663,7 @@ async def test_that_bsub_will_retry_and_succeed( ) bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC) driver = LsfDriver() - driver._bsub_retries = 2 + driver._max_bsub_attempts = 2 driver._sleep_time_between_cmd_retries = 0.2 await driver.submit(0, "sleep 10") diff --git a/tests/ert/unit_tests/scheduler/test_openpbs_driver.py b/tests/ert/unit_tests/scheduler/test_openpbs_driver.py index 4a81982a980..7e72f6e7045 100644 --- a/tests/ert/unit_tests/scheduler/test_openpbs_driver.py +++ b/tests/ert/unit_tests/scheduler/test_openpbs_driver.py @@ -407,7 +407,7 @@ async def test_that_qsub_will_retry_and_fail( qsub_path.write_text(f"#!/bin/sh\necho {error_msg} >&2\nexit {exit_code}") qsub_path.chmod(qsub_path.stat().st_mode | stat.S_IEXEC) driver = OpenPBSDriver() - driver._num_pbs_cmd_retries = 2 + driver._max_pbs_cmd_attempts = 2 driver._sleep_time_between_cmd_retries = 0.2 match_str = ( f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg}"' @@ -452,7 +452,7 @@ async def test_that_qsub_will_retry_and_succeed( ) qsub_path.chmod(qsub_path.stat().st_mode | stat.S_IEXEC) driver = OpenPBSDriver() - driver._num_pbs_cmd_retries = 2 + driver._max_pbs_cmd_attempts = 2 driver._sleep_time_between_cmd_retries = 0.2 await driver.submit(0, "sleep 10") @@ -490,7 +490,7 @@ async def test_that_qdel_will_retry_and_succeed( ) qdel_path.chmod(qdel_path.stat().st_mode | stat.S_IEXEC) driver = OpenPBSDriver() - driver._num_pbs_cmd_retries = 2 + driver._max_pbs_cmd_attempts = 2 driver._retry_pbs_cmd_interval = 0.2 driver._iens2jobid[0] = 111 await driver.kill(0)