Skip to content

Commit

Permalink
allow usage of user-defined docker images in runpod (skypilot-org#3728)
Browse files Browse the repository at this point in the history
* allow usage of user-defined docker images in runpod

* fix docker image support w runpod

* don't install ray, expect the docker image should have it

* add TODO for docker image size

Co-authored-by: Zhanghao Wu <[email protected]>

* runpod has image support

Co-authored-by: Zhanghao Wu <[email protected]>

* Fix docker ssh setups for custom docker image

* fix for axolotl image

* Add docs

* fix conda commands

---------

Co-authored-by: Zhanghao Wu <[email protected]>
  • Loading branch information
winglian and Michaelvll authored Aug 16, 2024
1 parent 69838f5 commit 61c9c87
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 11 deletions.
3 changes: 2 additions & 1 deletion docs/source/examples/docker-containers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ SkyPilot can run a container either as a task, or as the runtime environment of

.. note::

Running docker containers is `not supported on RunPod <https://docs.runpod.io/references/faq#can-i-run-my-own-docker-daemon-on-runpod>`_. To use RunPod, use ``setup`` and ``run`` to configure your environment. See `GitHub issue <https://github.com/skypilot-org/skypilot/issues/3096#issuecomment-2150559797>`_ for more.
Running docker containers is `not supported on RunPod <https://docs.runpod.io/references/faq#can-i-run-my-own-docker-daemon-on-runpod>`_. To use RunPod, either use your docker image (the username should be ``root`` for RunPod) :ref:`as a runtime environment <docker-containers-as-runtime-environments>` or use ``setup`` and ``run`` to configure your environment. See `GitHub issue <https://github.com/skypilot-org/skypilot/issues/3096#issuecomment-2150559797>`_ for more.


.. _docker-containers-as-tasks:

Expand Down
18 changes: 14 additions & 4 deletions sky/clouds/runpod.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ class RunPod(clouds.Cloud):
clouds.CloudImplementationFeatures.MULTI_NODE:
('Multi-node not supported yet, as the interconnection among nodes '
'are non-trivial on RunPod.'),
clouds.CloudImplementationFeatures.IMAGE_ID:
('Specifying image ID is not supported on RunPod.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is currently not supported on {_REPR}.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
('Customizing disk tier is not supported yet on RunPod.'),
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
Expand Down Expand Up @@ -175,10 +171,18 @@ def make_deploy_resources_variables(
else:
custom_resources = None

if r.image_id is None:
image_id = 'runpod/base:0.0.2'
elif r.extract_docker_image() is not None:
image_id = r.extract_docker_image()
else:
image_id = r.image_id[r.region]

return {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
'region': region.name,
'image_id': image_id,
}

def _get_feasible_launchable_resources(
Expand Down Expand Up @@ -276,3 +280,9 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
return service_catalog.validate_region_zone(region,
zone,
clouds='runpod')

@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
# Placeholder: reports 0.0 for every image; `image_id` and `region` are
# accepted but not consulted here.
# NOTE(review): presumably callers compare this value against the requested
# disk size, so 0.0 lets any image pass -- confirm against the caller.
# TODO: use 0.0 for now to allow all images. We should change this to
# return the docker image size.
return 0.0
4 changes: 3 additions & 1 deletion sky/provision/runpod/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
instance_type=config.node_config['InstanceType'],
region=region,
disk_size=config.node_config['DiskSize'],
ports=config.ports_to_open_on_launch)
image_name=config.node_config['ImageId'],
ports=config.ports_to_open_on_launch,
public_key=config.node_config['PublicKey'])
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
Expand Down
33 changes: 30 additions & 3 deletions sky/provision/runpod/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""RunPod library wrapper for SkyPilot."""

import base64
import time
from typing import Any, Dict, List, Optional

Expand Down Expand Up @@ -95,7 +96,7 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


def launch(name: str, instance_type: str, region: str, disk_size: int,
ports: Optional[List[int]]) -> str:
image_name: str, ports: Optional[List[int]], public_key: str) -> str:
"""Launches an instance with the given parameters.
Converts the instance_type to the RunPod GPU name, finds the specs for the
Expand All @@ -106,6 +107,31 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
cloud_type = instance_type.split('_')[2]

gpu_specs = runpod.runpod.get_gpu(gpu_type)
# TODO(zhwu): keep this aligned with the setup in
# `provision.kubernetes.instance.py`
setup_cmd = (
'prefix_cmd() '
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
'$(prefix_cmd) apt update;'
'export DEBIAN_FRONTEND=noninteractive;'
'$(prefix_cmd) apt install openssh-server rsync curl patch -y;'
'$(prefix_cmd) mkdir -p /var/run/sshd; '
'$(prefix_cmd) '
'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
'/etc/ssh/sshd_config; '
'$(prefix_cmd) sed '
'"s@session\\s*required\\s*pam_loginuid.so@session optional '
'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
'$(prefix_cmd) mkdir -p ~/.ssh; '
'$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
'$(prefix_cmd) chmod 700 ~/.ssh; '
f'$(prefix_cmd) echo "{public_key}" >> ~/.ssh/authorized_keys; '
'$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
'$(prefix_cmd) service ssh restart; '
'[ $(id -u) -eq 0 ] && echo alias sudo="" >> ~/.bashrc;sleep infinity')
# Use base64 to deal with the tricky quoting issues caused by runpod API.
encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')

# Port 8081 is occupied for nginx in the base image.
custom_ports_str = ''
Expand All @@ -114,7 +140,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

new_instance = runpod.runpod.create_pod(
name=name,
image_name='runpod/base:0.0.2',
image_name=image_name,
gpu_type_id=gpu_type,
cloud_type=cloud_type,
container_disk_in_gb=disk_size,
Expand All @@ -127,7 +153,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http'),
support_public_ip=True,
)
docker_args=
f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'')

return new_instance['id']

Expand Down
8 changes: 6 additions & 2 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,14 @@
CONDA_INSTALLATION_COMMANDS = (
'which conda > /dev/null 2>&1 || '
'{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
'bash Miniconda3-Linux-x86_64.sh -b && '
# We do not use && for installation of conda and the following init commands
# because for some images, conda is already installed, but not initialized.
# In this case, we need to initialize conda and set auto_activate_base to
# true.
'{ bash Miniconda3-Linux-x86_64.sh -b; '
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
'conda config --set auto_activate_base true && '
f'conda activate base; }}; '
f'conda activate base; }}; }}; '
'grep "# >>> conda initialize >>>" ~/.bashrc || '
'{ conda init && source ~/.bashrc; };'
# If Python version is greater than or equal to 3.12, create a new conda env
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/runpod-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ available_node_types:
node_config:
InstanceType: {{instance_type}}
DiskSize: {{disk_size}}
ImageId: {{image_id}}
PublicKey: |-
skypilot:ssh_public_key_content

head_node_type: ray_head_default

Expand Down

0 comments on commit 61c9c87

Please sign in to comment.