[Jobs] Add option to specify max_restarts_on_errors #4169

Merged: 35 commits, Oct 29, 2024
Changes from 17 commits

Commits (35):
8eba87b  Add option to specify `max_retry_on_failure` (Michaelvll, Oct 24, 2024)
7294204  fix recover counts (Michaelvll, Oct 24, 2024)
3ab9619  fix log streaming (Michaelvll, Oct 25, 2024)
7145842  fix docs (Michaelvll, Oct 25, 2024)
8bfd59a  fix (Michaelvll, Oct 25, 2024)
e459271  fix (Michaelvll, Oct 25, 2024)
de78310  fix (Michaelvll, Oct 25, 2024)
23345c0  fix (Michaelvll, Oct 25, 2024)
3709cd6  fix default value (Michaelvll, Oct 25, 2024)
92e7c35  Fix spinner (Michaelvll, Oct 25, 2024)
935491e  Add unit test for default strategy (Michaelvll, Oct 25, 2024)
90f95b1  fix test (Michaelvll, Oct 25, 2024)
ceff8cd  format (Michaelvll, Oct 25, 2024)
a20fa5c  Update docs/source/examples/managed-jobs.rst (Michaelvll, Oct 25, 2024)
b5b35f4  rename to restarts (Michaelvll, Oct 25, 2024)
149c9fd  Merge branch 'jobs-max-retry-on-failure' of github.com:skypilot-org/s… (Michaelvll, Oct 25, 2024)
1947605  Merge branch 'master' of github.com:skypilot-org/skypilot into jobs-m… (Michaelvll, Oct 25, 2024)
7cf2b17  Update docs/source/examples/managed-jobs.rst (Michaelvll, Oct 25, 2024)
44882fe  update docs (Michaelvll, Oct 25, 2024)
599a838  Merge branch 'master' of github.com:skypilot-org/skypilot into jobs-m… (Michaelvll, Oct 25, 2024)
a7d266b  warning instead of error out (Michaelvll, Oct 28, 2024)
087414b  Update docs/source/examples/managed-jobs.rst (Michaelvll, Oct 28, 2024)
3ffadb1  rename (Michaelvll, Oct 28, 2024)
da26fc1  add comment (Michaelvll, Oct 28, 2024)
bea7fe0  Merge branch 'jobs-max-retry-on-failure' of github.com:skypilot-org/s… (Michaelvll, Oct 28, 2024)
987df3d  fix (Michaelvll, Oct 28, 2024)
c3e88a2  rename (Michaelvll, Oct 28, 2024)
92acfe3  Update sky/execution.py (Michaelvll, Oct 28, 2024)
71e9518  Update sky/execution.py (Michaelvll, Oct 28, 2024)
acd96ab  address comments (Michaelvll, Oct 28, 2024)
02b0b19  Merge branch 'jobs-max-retry-on-failure' of github.com:skypilot-org/s… (Michaelvll, Oct 28, 2024)
86d0d64  format (Michaelvll, Oct 28, 2024)
0573c33  commit changes for docs (Michaelvll, Oct 28, 2024)
df62ee4  Format (Michaelvll, Oct 29, 2024)
d79cd2f  Merge branch 'master' of github.com:skypilot-org/skypilot into jobs-m… (Michaelvll, Oct 29, 2024)
22 changes: 22 additions & 0 deletions docs/source/examples/managed-jobs.rst
@@ -282,6 +282,28 @@ candidate resources for a job. See documentation :ref:`here
In this example, SkyPilot will perform cost optimizations to select the resource to use, which almost certainly
will be spot instances. If spot instances are not available, SkyPilot will fall back to launch on-demand instances.


Advanced Strategies for Recovery
---------------------------------

Recovery on User Code Crash/Failure
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, SkyPilot tries to recover a job when its cluster is preempted or fails. In some cases, you may also want to
restart the job when the user code itself fails, e.g., when a training job crashes due to an NVIDIA driver issue or NCCL timeouts. To enable this, you
can additionally set :code:`max_retry_on_failure` under :code:`resources.job_recovery` in the job YAML file.

.. code-block:: yaml

resources:
accelerators: A100:8
job_recovery:
max_retry_on_failure: 3
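
The same configuration can also be built programmatically. The following is a minimal, hypothetical sketch (not part of this PR's diff), assuming the :code:`sky.Resources` constructor accepts :code:`job_recovery` as a dict (as added in ``sky/resources.py`` below) and that :code:`sky.jobs.launch` is available; the retry-count key name follows the docs at this commit.

.. code-block:: python

   import sky

   # Hypothetical illustration; mirrors the YAML example above.
   task = sky.Task(run='python train.py')
   task.set_resources(
       sky.Resources(
           accelerators='A100:8',
           # `job_recovery` may be a strategy string or a dict; the retry
           # budget key name here follows this commit's docs.
           job_recovery={'max_retry_on_failure': 3},
       ))
   sky.jobs.launch(task, name='train-with-restarts')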


Recovery Policies
~~~~~~~~~~~~~~~~~

More advanced policies for resource selection, such as the `Can't Be Late
<https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao>`__ (NSDI'24)
paper, may be supported in the future.
4 changes: 4 additions & 0 deletions docs/source/reference/yaml-spec.rst
@@ -107,6 +107,10 @@ Available fields:
#
# default: EAGER_NEXT_REGION
job_recovery: none
# Or, to allow up to 3 retries on failure:
# job_recovery:
# strategy: EAGER_NEXT_REGION
# max_retry_on_failure: 3

# Disk size in GB to allocate for OS (mounted at /). Increase this if you
# have a large working directory or tasks that write out large outputs.
60 changes: 38 additions & 22 deletions sky/jobs/controller.py
@@ -160,22 +160,26 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
if task_id == 0:
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
self._backend.run_timestamp)
assert task.name is not None, task
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
task.name, self._job_id)
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
cluster_name, self._backend, task, self._retry_until_up)
managed_job_state.set_submitted(
self._job_id,
task_id,
self._backend.run_timestamp,
submitted_at,
resources_str=backend_utils.get_task_resources_str(
task, is_managed_job=True),
specs={
'max_restarts_on_failure':
self._strategy_executor.max_restarts_on_failure
},
callback_func=callback_func)
logger.info(
f'Submitted managed job {self._job_id} (task: {task_id}, name: '
f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
assert task.name is not None, task
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
task.name, self._job_id)
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
cluster_name, self._backend, task, self._retry_until_up)

logger.info('Started monitoring.')
managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
failure_reason = (
'To see the details, run: '
f'sky jobs logs --controller {self._job_id}')

managed_job_state.set_failed(
self._job_id,
task_id,
failure_type=managed_job_status,
failure_reason=failure_reason,
end_time=end_time,
callback_func=callback_func)
return False
# Although the cluster is healthy, we fail to access the
# job status. Try to recover the job (will not restart the
# cluster, if the cluster is healthy).
assert job_status is None, job_status
logger.info('Failed to fetch the job status while the '
'cluster is healthy. Try to recover the job '
'(the cluster will not be restarted).')

trigger_retry_on_failure = (
self._strategy_executor.trigger_retry_on_failure())
if trigger_retry_on_failure:
max_restarts = (
self._strategy_executor.max_restarts_on_failure)
logger.info(
Review comment (Collaborator): One of our users mentioned backoff between restarts - any thoughts on adding it here?

Reply (Author): We do have backoff between launches if the resources are not available across all regions/clouds. I feel adding additional backoff between job restarts is not that clean.

f'User program crashed '
f'({managed_job_status.value}). '
f'Retry the job as max_restarts_on_failure is '
f'set to {max_restarts}. '
f'[{self._strategy_executor.retry_cnt_on_failure}/'
f'{max_restarts}]')
else:
managed_job_state.set_failed(
self._job_id,
task_id,
failure_type=managed_job_status,
failure_reason=failure_reason,
end_time=end_time,
callback_func=callback_func)
return False
else:
# Although the cluster is healthy, we fail to access the
# job status. Try to recover the job (will not restart the
# cluster, if the cluster is healthy).
assert job_status is None, job_status
logger.info('Failed to fetch the job status while the '
'cluster is healthy. Try to recover the job '
'(the cluster will not be restarted).')
# When the handle is None, the cluster should be cleaned up already.
if handle is not None:
resources = handle.launched_resources
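To make the backoff idea from the review thread above concrete, here is a small, hypothetical sketch of capped exponential backoff between job restarts. It is not part of this PR; the constants and the helper name are assumptions for illustration only.

# Hypothetical sketch, not in this PR: capped exponential backoff between
# job restarts, keyed on the executor's retry_cnt_on_failure counter.
import time

RESTART_BACKOFF_BASE_SECONDS = 30  # assumed value, not from the PR
RESTART_BACKOFF_MAX_SECONDS = 600  # assumed value, not from the PR


def wait_before_restart(retry_cnt_on_failure: int) -> None:
    """Sleeps before restarting a failed job, doubling the gap per retry."""
    gap_seconds = min(
        RESTART_BACKOFF_BASE_SECONDS * (2**(retry_cnt_on_failure - 1)),
        RESTART_BACKOFF_MAX_SECONDS)
    time.sleep(gap_seconds)
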
31 changes: 26 additions & 5 deletions sky/jobs/recovery_strategy.py
@@ -66,7 +66,8 @@ class StrategyExecutor:
RETRY_INIT_GAP_SECONDS = 60

def __init__(self, cluster_name: str, backend: 'backends.Backend',
task: 'task_lib.Task', retry_until_up: bool) -> None:
task: 'task_lib.Task', retry_until_up: bool,
max_restarts_on_failure: int) -> None:
"""Initialize the strategy executor.

Args:
Expand All @@ -82,6 +83,8 @@ def __init__(self, cluster_name: str, backend: 'backends.Backend',
self.cluster_name = cluster_name
self.backend = backend
self.retry_until_up = retry_until_up
self.max_restarts_on_failure = max_restarts_on_failure
self.retry_cnt_on_failure = 0

def __init_subclass__(cls, name: str, default: bool = False):
RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ def make(cls, cluster_name: str, backend: 'backends.Backend',
# set the new_task_resources to be the same type (list or set) as the
# original task.resources
task.set_resources(type(task.resources)(new_resources_list))
return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
retry_until_up)
if isinstance(job_recovery, dict):
job_recovery_name = job_recovery.pop('strategy',
DEFAULT_RECOVERY_STRATEGY)
max_restarts_on_failure = job_recovery.pop(
'max_restarts_on_failure', 0)
else:
job_recovery_name = job_recovery
max_restarts_on_failure = 0
return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
task, retry_until_up,
max_restarts_on_failure)

def launch(self) -> float:
"""Launch the cluster for the first time.
Expand Down Expand Up @@ -368,6 +380,13 @@ def _launch(self,
f'{gap_seconds:.1f} seconds.')
time.sleep(gap_seconds)

def trigger_retry_on_failure(self) -> bool:
"""Trigger a retry on failure."""
Review comment (Collaborator): The name of this method is a bit misleading since this does not "trigger" a retry, rather just records it. Maybe rename to log_retry_on_failure or record_retry_on_failure? Docstr could also be updated to something like:
"""Records a retry event after a job failure and returns if more retries should be attempted."""

Reply (Author): Good point! I renamed it to should_restart_on_failure with the docstr updated. Wdyt?

self.retry_cnt_on_failure += 1
if self.retry_cnt_on_failure >= self.max_restarts_on_failure:
return False
return True


class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
default=False):
@@ -376,8 +395,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
_MAX_RETRY_CNT = 240 # Retry for 4 hours.

def __init__(self, cluster_name: str, backend: 'backends.Backend',
task: 'task_lib.Task', retry_until_up: bool) -> None:
super().__init__(cluster_name, backend, task, retry_until_up)
task: 'task_lib.Task', retry_until_up: bool,
max_restarts_on_failure: int) -> None:
super().__init__(cluster_name, backend, task, retry_until_up,
max_restarts_on_failure)
# Note down the cloud/region of the launched cluster, so that we can
# first retry in the same cloud/region. (Inside recover() we may not
# rely on cluster handle, as it can be None if the cluster is
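As a standalone illustration of the normalization performed in StrategyExecutor.make() above (a hypothetical helper written for this write-up, not code from the PR), the two accepted forms of job_recovery reduce to a strategy name plus a retry budget:

# Hypothetical, self-contained mirror of the dict-vs-string handling in
# StrategyExecutor.make() above.
from typing import Dict, Optional, Tuple, Union

DEFAULT_RECOVERY_STRATEGY = 'EAGER_NEXT_REGION'  # default named in the YAML spec above


def parse_job_recovery(
    job_recovery: Union[None, str, Dict[str, Union[str, int]]]
) -> Tuple[Optional[str], int]:
    """Returns (strategy_name, max_restarts_on_failure)."""
    if isinstance(job_recovery, dict):
        job_recovery = dict(job_recovery)  # avoid mutating the caller's dict
        strategy = job_recovery.pop('strategy', DEFAULT_RECOVERY_STRATEGY)
        max_restarts = job_recovery.pop('max_restarts_on_failure', 0)
        return strategy, max_restarts
    return job_recovery, 0


assert parse_job_recovery('FAILOVER') == ('FAILOVER', 0)
assert parse_job_recovery({'max_restarts_on_failure': 3}) == (
    'EAGER_NEXT_REGION', 3)
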
38 changes: 33 additions & 5 deletions sky/jobs/state.py
@@ -2,6 +2,7 @@
# TODO(zhwu): maybe use file based status instead of database, so
# that we can easily switch to a s3-based storage.
import enum
import json
import pathlib
import sqlite3
import time
@@ -65,7 +66,8 @@ def _get_db_path() -> str:
failure_reason TEXT,
spot_job_id INTEGER,
task_id INTEGER DEFAULT 0,
task_name TEXT)""")
task_name TEXT,
specs TEXT)""")
_CONN.commit()

db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ def _get_db_path() -> str:
'TEXT',
copy_from='job_name')

# `specs` holds auxiliary information about the task, e.g., the
# max_restarts_on_failure value. It is stored in JSON format.
db_utils.add_column_to_table(_CURSOR,
_CONN,
'spot',
'specs',
'TEXT',
value_to_replace_existing_entries=json.dumps({
'max_restarts_on_failure': 0,
}))

# `job_info` contains the mapping from job_id to the job_name.
# In the future, it may contain more information about each job.
_CURSOR.execute("""\
@@ -130,7 +143,8 @@ def _get_db_path() -> str:
'task_name',
# columns from the job_info table
'_job_info_job_id', # This should be the same as job_id
'job_name'
'job_name',
'specs',
]


@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):

def set_submitted(job_id: int, task_id: int, run_timestamp: str,
submit_time: float, resources_str: str,
callback_func: CallbackType):
specs: Dict[str, Union[str,
int]], callback_func: CallbackType):
"""Set the task to submitted.

Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
determine the log directory of the managed task.
submit_time: The time when the managed task is submitted.
resources_str: The resources string of the managed task.
specs: The specs of the managed task.
callback_func: The callback function.
"""
# Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
# the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
resources=(?),
submitted_at=(?),
status=(?),
run_timestamp=(?)
run_timestamp=(?),
specs=(?)
WHERE spot_job_id=(?) AND
task_id=(?)""",
(resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
run_timestamp, job_id, task_id))
run_timestamp, json.dumps(specs), job_id, task_id))
callback_func('SUBMITTED')


@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
for (job_id,) in rows:
return job_id
return None


def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
with db_utils.safe_cursor(_DB_PATH) as cursor:
task_specs = cursor.execute(
"""\
SELECT specs FROM spot
WHERE spot_job_id=(?) AND task_id=(?)""",
(job_id, task_id)).fetchone()
return json.loads(task_specs[0])
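
For clarity on the new specs column, here is a small, self-contained sketch (illustrative only, using an in-memory table rather than SkyPilot's schema) of the JSON round-trip that set_submitted() and get_task_specs() perform:

# Illustrative only: `specs` is written with json.dumps (set_submitted) and
# read back with json.loads (get_task_specs).
import json
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute(
    'CREATE TABLE spot (spot_job_id INTEGER, task_id INTEGER, specs TEXT)')
specs = {'max_restarts_on_failure': 3}
conn.execute('INSERT INTO spot VALUES (?, ?, ?)', (1, 0, json.dumps(specs)))
row = conn.execute(
    'SELECT specs FROM spot WHERE spot_job_id=(?) AND task_id=(?)',
    (1, 0)).fetchone()
assert json.loads(row[0]) == specs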
30 changes: 26 additions & 4 deletions sky/jobs/utils.py
@@ -70,7 +70,7 @@
# state, after the job finished. This is a safeguard to avoid the case where
# the managed job status fails to be updated and keep the `sky jobs logs`
# blocking for a long time.
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25


class UserSignal(enum.Enum):
@@ -392,8 +392,10 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
f'INFO: Log for the current task ({task_id}) '
'is finished. Waiting for the next task\'s log '
'to be started.')
status_display.update('Waiting for the next task: '
f'{task_id + 1}.')
print()
status_display.update(
ux_utils.spinner_message(
f'Waiting for the next task: {task_id + 1}'))
status_display.start()
original_task_id = task_id
while True:
@@ -405,7 +407,27 @@
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
continue
else:
break
Review comment (Contributor): Seems the retry logic is added in the wrong branch. It's currently in the else branch of if task_id < num_tasks - 1 and follow, which means it only triggers when we want to terminate. The retry check should be in the outer else branch where we handle cluster failures.

task_specs = managed_job_state.get_task_specs(
job_id, task_id)
if task_specs.get('max_restarts_on_failure', 0) == 0:
# We don't need to wait for the managed job status
# update, as the job is guaranteed to be in terminal
# state afterwards.
break
print()
status_display.update(
ux_utils.spinner_message(
'Waiting for next retry for the failed task'))
status_display.start()
while True:
_, managed_job_status = (
managed_job_state.get_latest_task_id_status(
job_id))
if (managed_job_status !=
managed_job_state.ManagedJobStatus.RUNNING):
break
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
continue
# The job can be cancelled by the user or the controller (when
# the cluster is partially preempted).
logger.debug(
23 changes: 17 additions & 6 deletions sky/resources.py
@@ -163,8 +163,18 @@ def __init__(
self._use_spot = use_spot if use_spot is not None else False
self._job_recovery = None
if job_recovery is not None:
if job_recovery.strip().lower() != 'none':
self._job_recovery = job_recovery.upper()
if isinstance(job_recovery, str):
job_recovery = {'strategy': job_recovery}
if 'strategy' not in job_recovery:
job_recovery['strategy'] = None

strategy_name = job_recovery['strategy']
if strategy_name == 'none':
self._job_recovery = None
else:
if strategy_name is not None:
job_recovery['strategy'] = strategy_name.upper()
self._job_recovery = job_recovery

if disk_size is not None:
if round(disk_size) != disk_size:
@@ -419,7 +429,7 @@ def use_spot_specified(self) -> bool:
return self._use_spot_specified

@property
def job_recovery(self) -> Optional[str]:
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
return self._job_recovery

@property
@@ -814,12 +824,13 @@ def _try_validate_managed_job_attributes(self) -> None:
Raises:
ValueError: if the attributes are invalid.
"""
if self._job_recovery is None:
if self._job_recovery is None or self._job_recovery['strategy'] is None:
return
if self._job_recovery not in managed_jobs.RECOVERY_STRATEGIES:
if (self._job_recovery['strategy']
not in managed_jobs.RECOVERY_STRATEGIES):
with ux_utils.print_exception_no_traceback():
raise ValueError(
f'Spot recovery strategy {self._job_recovery} '
f'Spot recovery strategy {self._job_recovery["strategy"]} '
'is not supported. The strategy should be among '
f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')

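Finally, a quick usage sketch of the two accepted forms after the Resources change above. This is hypothetical and not part of the diff; strategy names are validated against managed_jobs.RECOVERY_STRATEGIES inside the constructor.

import sky

# Hypothetical usage sketch of sky/resources.py after this change.
# A bare string is normalized to {'strategy': 'FAILOVER'} internally.
r1 = sky.Resources(job_recovery='failover')

# A dict may carry both the strategy and the restart budget; the strategy
# is upper-cased and validated, and unknown names raise ValueError.
r2 = sky.Resources(job_recovery={
    'strategy': 'eager_next_region',
    'max_restarts_on_failure': 3,
})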