Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[k8s] Add retry for apparmor failures #4176

Merged
merged 2 commits into from
Oct 25, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Kubernetes instance provisioning."""
import copy
import json
import time
from typing import Any, Dict, List, Optional
import uuid
Expand Down Expand Up @@ -425,6 +426,70 @@ def _label_pod(namespace: str, context: Optional[str], pod_name: str,
_request_timeout=kubernetes.API_TIMEOUT)


def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
                                        context: Optional[str]) -> Any:
    """Attempts to create a Kubernetes Pod and handle any errors.

    Currently, we handle errors due to the AppArmor annotation and retry if
    it fails due to the `FieldValueForbidden` error.
    See https://github.com/skypilot-org/skypilot/issues/4174 for details.

    Args:
        namespace: Namespace in which the Pod is created.
        pod_spec: Pod manifest as a dict. If the AppArmor retry is taken, the
            AppArmor annotation is deleted from this dict in place.
        context: Kubernetes context passed to `kubernetes.core_api`.

    Returns: The created Pod object.

    Raises:
        The API exception from the first attempt if it is not the AppArmor
        error, or if the annotation is not present in `pod_spec` (a retry
        would not help). The API exception from the second attempt if the
        retry fails as well.
    """
    apparmor_annotation = (
        'container.apparmor.security.beta.kubernetes.io/ray-node')
    try:
        # Attempt to create the Pod with the AppArmor annotation
        return kubernetes.core_api(context).create_namespaced_pod(
            namespace, pod_spec)
    except kubernetes.api_exception() as e:
        # The body is normally a JSON object with a 'message' field, but it
        # may also be None, non-JSON text, or JSON that is not an object.
        # None of these cases may raise here, otherwise the original API
        # exception would be masked by a TypeError/AttributeError.
        try:
            error_body = json.loads(e.body)
        except (TypeError, ValueError):
            # TypeError: body is None or otherwise not str/bytes.
            # ValueError: covers json.JSONDecodeError and undecodable bytes.
            error_body = None
        if isinstance(error_body, dict):
            error_message = str(error_body.get('message') or '')
        else:
            error_message = str(e.body)

        # Check if the error is due to the AppArmor annotation and retry.
        # We add an AppArmor annotation to set it as unconfined in our
        # base template in kubernetes-ray.yml.j2. This is required for
        # FUSE to work in the pod on most Kubernetes distributions.
        # However, some distributions do not support the AppArmor annotation
        # and will fail to create the pod. In this case, we retry without
        # the annotation.
        is_apparmor_error = (e.status == 422 and
                             'FieldValueForbidden' in error_message and
                             'AppArmorProfile: nil' in error_message)
        if not is_apparmor_error:
            # Re-raise the exception if it's a different error
            raise

        logger.warning('AppArmor annotation caused pod creation to fail. '
                       'Retrying without the annotation. '
                       'Note: this may cause bucket mounting to fail.')

        # Remove the AppArmor annotation. `or {}` tolerates a manifest in
        # which 'metadata' or 'annotations' is present but set to None.
        metadata = pod_spec.get('metadata') or {}
        annotations = metadata.get('annotations') or {}
        if apparmor_annotation not in annotations:
            logger.warning('AppArmor annotation not found in pod spec, '
                           'retrying will not help. '
                           f'Current annotations: {annotations}')
            raise
        # `annotations` is the dict stored in pod_spec, so deleting the key
        # here updates pod_spec itself.
        del annotations[apparmor_annotation]
        logger.info('AppArmor annotation removed from Pod spec.')

        # Retry Pod creation without the AppArmor annotation
        try:
            pod = kubernetes.core_api(context).create_namespaced_pod(
                namespace, pod_spec)
        except kubernetes.api_exception() as retry_exception:
            logger.info('Failed to create Pod without AppArmor annotation: '
                        f'{retry_exception}')
            raise
        logger.info(f'Pod {pod.metadata.name} created successfully '
                    'without AppArmor annotation.')
        return pod


def _create_pods(region: str, cluster_name_on_cloud: str,
config: common.ProvisionConfig) -> common.ProvisionRecord:
"""Create pods based on the config."""
Expand Down Expand Up @@ -546,8 +611,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
}
}

pod = kubernetes.core_api(context).create_namespaced_pod(
namespace, pod_spec)
pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
created_pods[pod.metadata.name] = pod
if head_pod_name is None:
head_pod_name = pod.metadata.name
Expand Down
Loading