Skip to content

Commit

Permalink
Nebius (#9)
Browse files Browse the repository at this point in the history
* K8s pod dns (#6)

* enumerate worker pods, bind pod_name to headless service

* patch worker numbering

* allow any traffic between sky pods

* add test for ssh vs hostname

* fix ip-worker mapping for k8s ssh

* lint

* Update README.md

* Update pypi-nightly-build.yml

point build to our pypi project

* patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint

patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint

lint

lint

Update pypi-nightly-build.yml

point build to our pypi project

patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint

lint

* Nebius Labeling

* nebius2

* Save local changes before merging upstream

* value error + support accelerators

* formatting

---------

Co-authored-by: Andrew Aikawa <[email protected]>
  • Loading branch information
ryanhayame and asaiacai authored Dec 12, 2024
1 parent f19667b commit e04e78b
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 2 deletions.
3 changes: 2 additions & 1 deletion sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,8 @@ def _create_pod_thread(i: int):
pod_spec_copy['metadata']['labels'].update(head_selector)
pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
else:
pod_spec_copy['metadata']['labels'].update(constants.WORKER_NODE_TAGS)
pod_spec_copy['metadata']['labels'].update(
constants.WORKER_NODE_TAGS)
pod_name = f'{cluster_name_on_cloud}-worker{i}'
pod_spec_copy['metadata']['name'] = pod_name
pod_spec_copy['metadata']['labels']['component'] = pod_name
Expand Down
39 changes: 38 additions & 1 deletion sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,20 +371,57 @@ class KarpenterLabelFormatter(SkyPilotLabelFormatter):
LABEL_KEY = 'karpenter.k8s.aws/instance-gpu-name'


class NebiusLabelFormatter(GPULabelFormatter):
    """Nebius label formatter.

    Nebius uses the label `node.kubernetes.io/instance-type` as the key,
    and a lowercase accelerator str as the value.
    ex. "node.kubernetes.io/instance-type=gpu-h100-sxm"
    Nebius docs: https://docs.nebius.com/compute/virtual-machines/types/
    """
    LABEL_KEY = 'node.kubernetes.io/instance-type'
    # Single source of truth: upper-case accelerator name -> label value.
    # Supporting a new accelerator is a one-line addition here; both
    # SUPPORTED_ACCELERATORS and the two conversion methods follow from it.
    _LABEL_VALUES = {'H100': 'gpu-h100-sxm'}
    SUPPORTED_ACCELERATORS = list(_LABEL_VALUES)

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        """Return the node label key; `accelerator` is not consulted."""
        return cls.LABEL_KEY

    @classmethod
    def get_label_keys(cls) -> List[str]:
        """Return all label keys used by this formatter (just one)."""
        return [cls.LABEL_KEY]

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        """Return the label value for `accelerator` (case-insensitive).

        Raises:
            ValueError: if the accelerator is not supported.
        """
        name = accelerator.upper()
        label_value = cls._LABEL_VALUES.get(name)
        if label_value is None:
            raise ValueError(
                f'Unsupported accelerator: {name}. '
                f'Supported accelerators are: {cls.SUPPORTED_ACCELERATORS}')
        return label_value

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        """Return True iff `label_key` is exactly this formatter's key."""
        return label_key == cls.LABEL_KEY

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Return the accelerator name for a label value.

        Known label values are translated through the mapping table. Any
        other value falls back to 'H100'.
        """
        for name, label_value in cls._LABEL_VALUES.items():
            if value == label_value:
                return name
        # NOTE(review): the fallback keeps the previous behaviour, where
        # every label value was reported as 'H100'. The instance-type label
        # is not GPU-specific, so non-GPU values can presumably reach this
        # point; confirm with callers before turning this into an error.
        return 'H100'


# LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to
# discover the accelerator type from. The order of the list is important, as
# it will be used to determine the priority of the label formats when
# auto-detecting the GPU label type.
# Entries are the formatter classes themselves, not instances.
LABEL_FORMATTER_REGISTRY = [
SkyPilotLabelFormatter, GKELabelFormatter, KarpenterLabelFormatter,
GFDLabelFormatter, CoreWeaveLabelFormatter
GFDLabelFormatter, CoreWeaveLabelFormatter, NebiusLabelFormatter
]

# Mapping of autoscaler type to label formatter
# Keys are KubernetesAutoscalerType enum members; values are the label
# formatter classes (not instances) paired with each autoscaler type.
AUTOSCALER_TO_LABEL_FORMATTER = {
kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter, # pylint: disable=line-too-long
kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter,
kubernetes_enums.KubernetesAutoscalerType.NEBIUS: NebiusLabelFormatter,
}


Expand Down
1 change: 1 addition & 0 deletions sky/utils/kubernetes_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ class KubernetesAutoscalerType(enum.Enum):
GKE = 'gke'
KARPENTER = 'karpenter'
GENERIC = 'generic'
NEBIUS = 'nebius'

0 comments on commit e04e78b

Please sign in to comment.