From 9e50959e03146b599d9d1b3646573c913ec95bac Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 13 Aug 2024 22:14:04 -0700 Subject: [PATCH 1/5] Minor: typo fix. (#3830) --- sky/data/storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 0caeef2bc7a..f09d79ea48e 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -869,7 +869,7 @@ def add_store(self, logger.info(f'Storage type {store_type} already exists under ' f'storage account {storage_account_name!r}.') else: - logger.info(f'Storage type {store_type} already exist.') + logger.info(f'Storage type {store_type} already exists.') return self.stores[store_type] store_cls: Type[AbstractStore] @@ -2123,7 +2123,7 @@ def initialize(self): """Initializes the AZ Container object on the cloud. Initialization involves fetching container if exists, or creating it if - it does not. Also, it checks for the existance of the storage account + it does not. Also, it checks for the existence of the storage account if provided by the user and the resource group is inferred from it. If not provided, both are created with a default naming conventions. @@ -2186,7 +2186,7 @@ def _get_storage_account_and_resource_group( Raises: StorageBucketCreateError: If storage account attempted to be - created already exists + created already exists. NonExistentStorageAccountError: When storage account provided either through config.yaml or local db does not exist under user's subscription ID. 
From 6f89fa5cb79766f7ef4af3b617aa1c12f1114727 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 15 Aug 2024 10:12:09 -0700 Subject: [PATCH 2/5] [Azure] Fix docker login on Azure (#3831) * Fix Azure docker login * fix logging --- sky/provision/azure/config.py | 2 +- sky/templates/azure-ray.yml.j2 | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sky/provision/azure/config.py b/sky/provision/azure/config.py index 5d9385bd73c..146deaa6781 100644 --- a/sky/provision/azure/config.py +++ b/sky/provision/azure/config.py @@ -84,7 +84,7 @@ def bootstrap_instances( logger.info( f'Azure resource group {resource_group} of a recent ' f'terminated cluster {cluster_name_on_cloud} is being ' - 'deleted. It can only be provisioned after it is fully' + 'deleted. It can only be provisioned after it is fully ' 'deleted. Waiting...') time.sleep(1) retry += 1 diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 16eb1d9dd23..39672a976b8 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -17,6 +17,15 @@ docker: {%- for run_option in docker_run_options %} - {{run_option}} {%- endfor %} + {%- if docker_login_config is not none %} + docker_login_config: + username: |- + {{docker_login_config.username}} + password: |- + {{docker_login_config.password}} + server: |- + {{docker_login_config.server}} + {%- endif %} {%- endif %} provider: @@ -28,15 +37,6 @@ provider: # instead of the cluster_name. This ensures that ray creates new instances # for different cluster_name. resource_group: {{resource_group}} -{%- if docker_login_config is not none %} - # We put docker login config in provider section because ray's schema disabled - # additionalProperties for docker config. 
- # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227 - docker_login_config: - username: {{docker_login_config.username}} - password: {{docker_login_config.password}} - server: {{docker_login_config.server}} -{%- endif %} # Keep (otherwise cannot reuse when re-provisioning). # teardown(terminate=True) will override this. cache_stopped_nodes: True From df2a9b4b8982480340426a4fba0176af443c88b9 Mon Sep 17 00:00:00 2001 From: Kihyo Moon <36657450+hyoxt121@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:12:21 +0900 Subject: [PATCH 3/5] [SCP] Fix the code for openapi changes (#3826) * fix the code for scp openapi change fix the code for scp openapi change * fix the code for scp openapi change fix the code for scp openapi change --- sky/clouds/utils/scp_utils.py | 1 + sky/skylet/providers/scp/config.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sky/clouds/utils/scp_utils.py b/sky/clouds/utils/scp_utils.py index 40c3e4a730e..3e91e22e6d9 100644 --- a/sky/clouds/utils/scp_utils.py +++ b/sky/clouds/utils/scp_utils.py @@ -223,6 +223,7 @@ def _delete(self, url, request_body=None): def create_security_group(self, zone_id, vpc, sg_name): url = f'{API_ENDPOINT}/security-group/v3/security-groups' request_body = { + 'loggable': False, 'securityGroupName': sg_name, 'serviceZoneId': zone_id, 'vpcId': vpc, diff --git a/sky/skylet/providers/scp/config.py b/sky/skylet/providers/scp/config.py index ad4fd6e07e7..c20b1837f26 100644 --- a/sky/skylet/providers/scp/config.py +++ b/sky/skylet/providers/scp/config.py @@ -114,13 +114,21 @@ def get_vcp_subnets(self): def _get_vm_init_script(self, ssh_public_key): + import subprocess init_script_content = self._get_default_config_cmd( ) + self._get_ssh_key_gen_cmd(ssh_public_key) + init_script_content_string = f'"{init_script_content}"' + command = f'echo {init_script_content_string} | base64' + result = subprocess.run(command, + 
shell=True, + capture_output=True, + text=True) + init_script_content_base64 = result.stdout return { - "encodingType": "plain", + "encodingType": "base64", "initialScriptShell": "bash", "initialScriptType": "text", - "initialScriptContent": init_script_content + "initialScriptContent": init_script_content_base64 } def _get_ssh_key_gen_cmd(self, ssh_public_key): From 69838f5e94b3f45b6b77e0791576be5299da2b01 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 16 Aug 2024 00:27:42 -0700 Subject: [PATCH 4/5] [GCP DWS] Fix None issue when no provision timeout is provided (#3835) * Fix None issue when no provision timeout is provided * raies instead of print * Change to check provision_timeout is none --- sky/provision/gcp/mig_utils.py | 1 + sky/templates/gcp-ray.yml.j2 | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sky/provision/gcp/mig_utils.py b/sky/provision/gcp/mig_utils.py index 9e33f5171e2..3a7d0db1805 100644 --- a/sky/provision/gcp/mig_utils.py +++ b/sky/provision/gcp/mig_utils.py @@ -207,3 +207,4 @@ def wait_for_managed_group_to_be_stable(project_id: str, zone: str, except subprocess.CalledProcessError as e: stderr = e.stderr.decode('ascii') logger.info(stderr) + raise diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index d986adbf6df..d7e787953d9 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -88,7 +88,9 @@ available_node_types: {%- if gcp_use_managed_instance_group %} managed-instance-group: run_duration: {{ run_duration }} + {%- if provision_timeout is defined and provision_timeout is not none %} provision_timeout: {{ provision_timeout }} + {%- endif %} {%- endif %} {%- if specific_reservations %} reservationAffinity: From 61c9c87136360e2bb8d21c73aef745afb6a689ab Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 16 Aug 2024 18:33:06 -0400 Subject: [PATCH 5/5] allow usage of user-defined docker images in runpod (#3728) * allow usage of user-defined docker images in runpod * fix docker image 
support w runpod * don't install ray, expect the docker image should have it * add TODO for docker image size Co-authored-by: Zhanghao Wu * runpod has imaged support Co-authored-by: Zhanghao Wu * Fix docker ssh setups for custom docker image * fix for axolotl image * Add docs * fix conda commands --------- Co-authored-by: Zhanghao Wu --- docs/source/examples/docker-containers.rst | 3 +- sky/clouds/runpod.py | 18 +++++++++--- sky/provision/runpod/instance.py | 4 ++- sky/provision/runpod/utils.py | 33 ++++++++++++++++++++-- sky/skylet/constants.py | 8 ++++-- sky/templates/runpod-ray.yml.j2 | 3 ++ 6 files changed, 58 insertions(+), 11 deletions(-) diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 8bc7ae16837..582db94ee79 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -10,7 +10,8 @@ SkyPilot can run a container either as a task, or as the runtime environment of .. note:: - Running docker containers is `not supported on RunPod `_. To use RunPod, use ``setup`` and ``run`` to configure your environment. See `GitHub issue `_ for more. + Running docker containers is `not supported on RunPod `_. To use RunPod, either use your docker image (the username should be ``root`` for RunPod) :ref:`as a runtime environment ` or use ``setup`` and ``run`` to configure your environment. See `GitHub issue `_ for more. + .. 
_docker-containers-as-tasks: diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 30d650de07c..9a6b483619a 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -30,10 +30,6 @@ class RunPod(clouds.Cloud): clouds.CloudImplementationFeatures.MULTI_NODE: ('Multi-node not supported yet, as the interconnection among nodes ' 'are non-trivial on RunPod.'), - clouds.CloudImplementationFeatures.IMAGE_ID: - ('Specifying image ID is not supported on RunPod.'), - clouds.CloudImplementationFeatures.DOCKER_IMAGE: - (f'Docker image is currently not supported on {_REPR}.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: ('Customizing disk tier is not supported yet on RunPod.'), clouds.CloudImplementationFeatures.STORAGE_MOUNTING: @@ -175,10 +171,18 @@ def make_deploy_resources_variables( else: custom_resources = None + if r.image_id is None: + image_id = 'runpod/base:0.0.2' + elif r.extract_docker_image() is not None: + image_id = r.extract_docker_image() + else: + image_id = r.image_id[r.region] + return { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, 'region': region.name, + 'image_id': image_id, } def _get_feasible_launchable_resources( @@ -276,3 +280,9 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]): return service_catalog.validate_region_zone(region, zone, clouds='runpod') + + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: + # TODO: use 0.0 for now to allow all images. We should change this to + # return the docker image size. 
+ return 0.0 diff --git a/sky/provision/runpod/instance.py b/sky/provision/runpod/instance.py index 1f941ab1b9e..ecac748a58f 100644 --- a/sky/provision/runpod/instance.py +++ b/sky/provision/runpod/instance.py @@ -86,7 +86,9 @@ def run_instances(region: str, cluster_name_on_cloud: str, instance_type=config.node_config['InstanceType'], region=region, disk_size=config.node_config['DiskSize'], - ports=config.ports_to_open_on_launch) + image_name=config.node_config['ImageId'], + ports=config.ports_to_open_on_launch, + public_key=config.node_config['PublicKey']) except Exception as e: # pylint: disable=broad-except logger.warning(f'run_instances error: {e}') raise diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index a53b12b3a20..24af263f13c 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -1,5 +1,6 @@ """RunPod library wrapper for SkyPilot.""" +import base64 import time from typing import Any, Dict, List, Optional @@ -95,7 +96,7 @@ def list_instances() -> Dict[str, Dict[str, Any]]: def launch(name: str, instance_type: str, region: str, disk_size: int, - ports: Optional[List[int]]) -> str: + image_name: str, ports: Optional[List[int]], public_key: str) -> str: """Launches an instance with the given parameters. 
Converts the instance_type to the RunPod GPU name, finds the specs for the @@ -106,6 +107,31 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, cloud_type = instance_type.split('_')[2] gpu_specs = runpod.runpod.get_gpu(gpu_type) + # TODO(zhwu): keep this aligned with setups in + # `provision.kubernetes.instance.py` + setup_cmd = ( + 'prefix_cmd() ' + '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; ' + '$(prefix_cmd) apt update;' + 'export DEBIAN_FRONTEND=noninteractive;' + '$(prefix_cmd) apt install openssh-server rsync curl patch -y;' + '$(prefix_cmd) mkdir -p /var/run/sshd; ' + '$(prefix_cmd) ' + 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" ' + '/etc/ssh/sshd_config; ' + '$(prefix_cmd) sed ' + '"s@session\\s*required\\s*pam_loginuid.so@session optional ' + 'pam_loginuid.so@g" -i /etc/pam.d/sshd; ' + 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; ' + '$(prefix_cmd) mkdir -p ~/.ssh; ' + '$(prefix_cmd) chown -R $(whoami) ~/.ssh;' + '$(prefix_cmd) chmod 700 ~/.ssh; ' + f'$(prefix_cmd) echo "{public_key}" >> ~/.ssh/authorized_keys; ' + '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; ' + '$(prefix_cmd) service ssh restart; ' + '[ $(id -u) -eq 0 ] && echo alias sudo="" >> ~/.bashrc;sleep infinity') + # Use base64 to deal with the tricky quoting issues caused by runpod API. + encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8') # Port 8081 is occupied for nginx in the base image. 
custom_ports_str = '' @@ -114,7 +140,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, new_instance = runpod.runpod.create_pod( name=name, - image_name='runpod/base:0.0.2', + image_name=image_name, gpu_type_id=gpu_type, cloud_type=cloud_type, container_disk_in_gb=disk_size, @@ -127,7 +153,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,' f'{constants.SKY_REMOTE_RAY_PORT}/http'), support_public_ip=True, - ) + docker_args= + f'bash -c \'echo {encoded} | base64 --decode > init.sh; bash init.sh\'') return new_instance['id'] diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 84a6491605a..30820a3a91e 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -129,10 +129,14 @@ CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' '{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long - 'bash Miniconda3-Linux-x86_64.sh -b && ' + # We do not use && for installation of conda and the following init commands + # because for some images, conda is already installed, but not initialized. + # In this case, we need to initialize conda and set auto_activate_base to + # true. 
+ '{ bash Miniconda3-Linux-x86_64.sh -b; ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true && ' - f'conda activate base; }}; ' + f'conda activate base; }}; }}; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' # If Python version is larger then equal to 3.12, create a new conda env diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index 8c063ac4f5d..4d9b0637bd0 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -21,6 +21,9 @@ available_node_types: node_config: InstanceType: {{instance_type}} DiskSize: {{disk_size}} + ImageId: {{image_id}} + PublicKey: |- + skypilot:ssh_public_key_content head_node_type: ray_head_default