Skip to content

Commit

Permalink
Merge branch 'skypilot-org:master' into konduktor
Browse files Browse the repository at this point in the history
  • Loading branch information
asaiacai authored Aug 16, 2024
2 parents 2d9f518 + 61c9c87 commit 28cb68c
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 26 deletions.
3 changes: 2 additions & 1 deletion docs/source/examples/docker-containers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ SkyPilot can run a container either as a task, or as the runtime environment of

.. note::

Running docker containers is `not supported on RunPod <https://docs.runpod.io/references/faq#can-i-run-my-own-docker-daemon-on-runpod>`_. To use RunPod, use ``setup`` and ``run`` to configure your environment. See `GitHub issue <https://github.com/skypilot-org/skypilot/issues/3096#issuecomment-2150559797>`_ for more.
Running docker containers is `not supported on RunPod <https://docs.runpod.io/references/faq#can-i-run-my-own-docker-daemon-on-runpod>`_. To use RunPod, either use your docker image (the username should be ``root`` for RunPod) :ref:`as a runtime environment <docker-containers-as-runtime-environments>` or use ``setup`` and ``run`` to configure your environment. See `GitHub issue <https://github.com/skypilot-org/skypilot/issues/3096#issuecomment-2150559797>`_ for more.


.. _docker-containers-as-tasks:

Expand Down
18 changes: 14 additions & 4 deletions sky/clouds/runpod.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ class RunPod(clouds.Cloud):
clouds.CloudImplementationFeatures.MULTI_NODE:
('Multi-node not supported yet, as the interconnection among nodes '
'are non-trivial on RunPod.'),
clouds.CloudImplementationFeatures.IMAGE_ID:
('Specifying image ID is not supported on RunPod.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is currently not supported on {_REPR}.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
('Customizing disk tier is not supported yet on RunPod.'),
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
Expand Down Expand Up @@ -175,10 +171,18 @@ def make_deploy_resources_variables(
else:
custom_resources = None

if r.image_id is None:
image_id = 'runpod/base:0.0.2'
elif r.extract_docker_image() is not None:
image_id = r.extract_docker_image()
else:
image_id = r.image_id[r.region]

return {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
'region': region.name,
'image_id': image_id,
}

def _get_feasible_launchable_resources(
Expand Down Expand Up @@ -276,3 +280,9 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
return service_catalog.validate_region_zone(region,
zone,
clouds='runpod')

@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
# TODO: use 0.0 for now to allow all images. We should change this to
# return the docker image size.
return 0.0
1 change: 1 addition & 0 deletions sky/clouds/utils/scp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def _delete(self, url, request_body=None):
def create_security_group(self, zone_id, vpc, sg_name):
url = f'{API_ENDPOINT}/security-group/v3/security-groups'
request_body = {
'loggable': False,
'securityGroupName': sg_name,
'serviceZoneId': zone_id,
'vpcId': vpc,
Expand Down
6 changes: 3 additions & 3 deletions sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ def add_store(self,
logger.info(f'Storage type {store_type} already exists under '
f'storage account {storage_account_name!r}.')
else:
logger.info(f'Storage type {store_type} already exist.')
logger.info(f'Storage type {store_type} already exists.')
return self.stores[store_type]

store_cls: Type[AbstractStore]
Expand Down Expand Up @@ -2123,7 +2123,7 @@ def initialize(self):
"""Initializes the AZ Container object on the cloud.
Initialization involves fetching container if exists, or creating it if
it does not. Also, it checks for the existance of the storage account
it does not. Also, it checks for the existence of the storage account
if provided by the user and the resource group is inferred from it.
If not provided, both are created with default naming conventions.
Expand Down Expand Up @@ -2186,7 +2186,7 @@ def _get_storage_account_and_resource_group(
Raises:
StorageBucketCreateError: If storage account attempted to be
created already exists
created already exists.
NonExistentStorageAccountError: When storage account provided
either through config.yaml or local db does not exist under
user's subscription ID.
Expand Down
2 changes: 1 addition & 1 deletion sky/provision/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def bootstrap_instances(
logger.info(
f'Azure resource group {resource_group} of a recent '
f'terminated cluster {cluster_name_on_cloud} is being '
'deleted. It can only be provisioned after it is fully'
'deleted. It can only be provisioned after it is fully '
'deleted. Waiting...')
time.sleep(1)
retry += 1
Expand Down
1 change: 1 addition & 0 deletions sky/provision/gcp/mig_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,4 @@ def wait_for_managed_group_to_be_stable(project_id: str, zone: str,
except subprocess.CalledProcessError as e:
stderr = e.stderr.decode('ascii')
logger.info(stderr)
raise
4 changes: 3 additions & 1 deletion sky/provision/runpod/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
instance_type=config.node_config['InstanceType'],
region=region,
disk_size=config.node_config['DiskSize'],
ports=config.ports_to_open_on_launch)
image_name=config.node_config['ImageId'],
ports=config.ports_to_open_on_launch,
public_key=config.node_config['PublicKey'])
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
Expand Down
33 changes: 30 additions & 3 deletions sky/provision/runpod/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""RunPod library wrapper for SkyPilot."""

import base64
import time
from typing import Any, Dict, List, Optional

Expand Down Expand Up @@ -95,7 +96,7 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


def launch(name: str, instance_type: str, region: str, disk_size: int,
ports: Optional[List[int]]) -> str:
image_name: str, ports: Optional[List[int]], public_key: str) -> str:
"""Launches an instance with the given parameters.
Converts the instance_type to the RunPod GPU name, finds the specs for the
Expand All @@ -106,6 +107,31 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
cloud_type = instance_type.split('_')[2]

gpu_specs = runpod.runpod.get_gpu(gpu_type)
# TODO(zhwu): keep this aligned with the setup in
# `provision.kubernetes.instance.py`
setup_cmd = (
'prefix_cmd() '
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
'$(prefix_cmd) apt update;'
'export DEBIAN_FRONTEND=noninteractive;'
'$(prefix_cmd) apt install openssh-server rsync curl patch -y;'
'$(prefix_cmd) mkdir -p /var/run/sshd; '
'$(prefix_cmd) '
'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
'/etc/ssh/sshd_config; '
'$(prefix_cmd) sed '
'"s@session\\s*required\\s*pam_loginuid.so@session optional '
'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
'$(prefix_cmd) mkdir -p ~/.ssh; '
'$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
'$(prefix_cmd) chmod 700 ~/.ssh; '
f'$(prefix_cmd) echo "{public_key}" >> ~/.ssh/authorized_keys; '
'$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
'$(prefix_cmd) service ssh restart; '
'[ $(id -u) -eq 0 ] && echo alias sudo="" >> ~/.bashrc;sleep infinity')
# Use base64 to deal with the tricky quoting issues caused by runpod API.
encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')

# Port 8081 is occupied by nginx in the base image.
custom_ports_str = ''
Expand All @@ -114,7 +140,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

new_instance = runpod.runpod.create_pod(
name=name,
image_name='runpod/base:0.0.2',
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
Expand All @@ -127,7 +153,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
)
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')

return new_instance['id']

Expand Down
8 changes: 6 additions & 2 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,14 @@
CONDA_INSTALLATION_COMMANDS = (
'which conda > /dev/null 2>&1 || '
'{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
'bash Miniconda3-Linux-x86_64.sh -b && '
# We do not use && for installation of conda and the following init commands
# because for some images, conda is already installed, but not initialized.
# In this case, we need to initialize conda and set auto_activate_base to
# true.
'{ bash Miniconda3-Linux-x86_64.sh -b; '
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
'conda config --set auto_activate_base true && '
f'conda activate base; }}; '
f'conda activate base; }}; }}; '
'grep "# >>> conda initialize >>>" ~/.bashrc || '
'{ conda init && source ~/.bashrc; };'
# If Python version is greater than or equal to 3.12, create a new conda env
Expand Down
12 changes: 10 additions & 2 deletions sky/skylet/providers/scp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,21 @@ def get_vcp_subnets(self):

def _get_vm_init_script(self, ssh_public_key):

import subprocess
init_script_content = self._get_default_config_cmd(
) + self._get_ssh_key_gen_cmd(ssh_public_key)
init_script_content_string = f'"{init_script_content}"'
command = f'echo {init_script_content_string} | base64'
result = subprocess.run(command,
shell=True,
capture_output=True,
text=True)
init_script_content_base64 = result.stdout
return {
"encodingType": "plain",
"encodingType": "base64",
"initialScriptShell": "bash",
"initialScriptType": "text",
"initialScriptContent": init_script_content
"initialScriptContent": init_script_content_base64
}

def _get_ssh_key_gen_cmd(self, ssh_public_key):
Expand Down
18 changes: 9 additions & 9 deletions sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ docker:
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
{{docker_login_config.username}}
password: |-
{{docker_login_config.password}}
server: |-
{{docker_login_config.server}}
{%- endif %}
{%- endif %}

provider:
Expand All @@ -28,15 +37,6 @@ provider:
# instead of the cluster_name. This ensures that ray creates new instances
# for different cluster_name.
resource_group: {{resource_group}}
{%- if docker_login_config is not none %}
# We put docker login config in provider section because ray's schema disabled
# additionalProperties for docker config.
# See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227
docker_login_config:
username: {{docker_login_config.username}}
password: {{docker_login_config.password}}
server: {{docker_login_config.server}}
{%- endif %}
# Keep (otherwise cannot reuse when re-provisioning).
# teardown(terminate=True) will override this.
cache_stopped_nodes: True
Expand Down
2 changes: 2 additions & 0 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ available_node_types:
{%- if gcp_use_managed_instance_group %}
managed-instance-group:
run_duration: {{ run_duration }}
{%- if provision_timeout is defined and provision_timeout is not none %}
provision_timeout: {{ provision_timeout }}
{%- endif %}
{%- endif %}
{%- if specific_reservations %}
reservationAffinity:
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/runpod-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ available_node_types:
node_config:
InstanceType: {{instance_type}}
DiskSize: {{disk_size}}
ImageId: {{image_id}}
PublicKey: |-
skypilot:ssh_public_key_content

head_node_type: ray_head_default

Expand Down

0 comments on commit 28cb68c

Please sign in to comment.