diff --git a/.gitignore b/.gitignore index 29c033d7..ec9b74d6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ poetry.lock *.gvmi.descr.bin .requirements.txt .python-version +golem-cluster.dev.yaml +golem-cluster.local.yaml diff --git a/golem-cluster-dev.yaml b/golem-cluster-dev.yaml deleted file mode 100644 index a482360f..00000000 --- a/golem-cluster-dev.yaml +++ /dev/null @@ -1,124 +0,0 @@ -# Ray on Golem cluster name -cluster_name: golem-cluster - -# The maximum number of workers the cluster will have at any given time -max_workers: 10 - -# The number of minutes that need to pass before an idle worker node is removed by the Autoscaler -idle_timeout_minutes: 2 - -# The cloud provider-specific configuration properties. -provider: - type: "external" - use_internal_ips: true - module: "ray_on_golem.provider.node_provider.GolemNodeProvider" - parameters: - # Port of golem webserver that has connection with golem network - webserver_port: 4578 - - enable_registry_stats: false - - # Blockchain used for payments. - # Goerli means running free nodes on testnet, - # Polygon is for mainnet operations. - network: "goerli" - - # Maximum amount of GLMs that's going to be spent for the whole cluster - budget: 2 - - # Params for creating golem demands (same for head and workers) - node_config: - demand: - # if not provided, image_tag will be autodetected based on currently used python and ray versions - # check available versions at https://registry.golem.network/explore/golem/ray-on-golem - image_tag: "blueshade/ray-on-golem:0.2.1-py3.10.13-ray2.7.1" - capabilities: ["vpn", "inet", "manifest-support"] - min_mem_gib: 0 - min_cpu_threads: 0 - min_storage_gib: 0 - - cost_management: # TODO: Consider more suitable parameter name - # Estimated average load and duration for worker that tells cost management to pick the least expensive Golem provider offers first. - # If not provided, offers will be picked at random. - # Both values need to be defined or undefined together. 
- average_cpu_load: 0.8 - average_duration_minutes: 20 - - # Amount of GLMs for average usage which Golem provider offer will be rejected if exceeded. - # Requires "average_cpu_load" and "average_duration_minutes" parameters to take effect. - max_average_usage_cost: 1.5 - - # Amount of GLMs for worker initiation which Golem provider offer will be rejected if exceeded. - max_initial_price: 0.5 - - # Amount of GLMs for CPU utilisation per second which Golem provider offer will be rejected if exceeded. - max_cpu_sec_price: 0.0005 - - # Amount of GLMs for each second that worker runs which Golem provider offer will be rejected if exceeded. - max_duration_sec_price: 0.0005 - -# The files or directories to copy to the head and worker nodes -file_mounts: - # remote_path: local_path - { - "/app/ray_on_golem": "./ray_on_golem", - "/app/golem": "../golem-core-python/golem", - } - -rsync_exclude: [ - "**/__pycache__", -] - -# Tells the autoscaler the allowed node types and the resources they provide -available_node_types: - ray.head.default: - # The minimum number of worker nodes of this type to launch - min_workers: 0 - - # The maximum number of worker nodes of this type to launch - max_workers: 0 - - # The node type's CPU and GPU resources - resources: {"CPU": 1} - - node_config: {} # TODO: Demand description here - ray.worker.default: - min_workers: 1 - max_workers: 10 - resources: {"CPU": 1} - node_config: {} - -# List of commands that will be run to initialize the nodes (before `setup_commands`) -initialization_commands: [ - "cp -fR /app/golem/* $(python -c 'import site; print(site.getsitepackages()[0])')/golem" -] - -# List of shell commands to run to set up nodes -setup_commands: [] - -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. -worker_setup_commands: [] - -# Command to start ray on the head node. You don't need to change this. 
-head_start_ray_commands: [ - "ray start --head --node-ip-address $NODE_IP --include-dashboard=True --dashboard-host 0.0.0.0 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml", -] - -# Command to start ray on worker nodes. You don't need to change this. -worker_start_ray_commands: [ - "ray start --address $RAY_HEAD_IP:6379", -] - -# Authentication credentials that Ray will use to launch nodes -# auth: - # Custom username for ssh - # ssh_user: "root" - - # If ssh_private_key will be not provided, temporary key will be created and used - # ssh_private_key: "~/.ssh/id_rsa" - -# A list of paths to the files or directories to copy from the head node to the worker nodes -cluster_synced_files: [] diff --git a/golem-cluster.override.1-source-files.yaml b/golem-cluster.override.1-source-files.yaml new file mode 100644 index 00000000..bf98ce6c --- /dev/null +++ b/golem-cluster.override.1-source-files.yaml @@ -0,0 +1,16 @@ +# The files or directories to copy to the head and worker nodes +file_mounts: + # remote_path: local_path + { + "/app/ray_on_golem": "./ray_on_golem", + "/app/golem": "../golem-core-python/golem", + } + +rsync_exclude: [ + "**/__pycache__", +] + +# List of commands that will be run to initialize the nodes (before `setup_commands`) +initialization_commands: [ + "cp -fR /app/golem/* $(python -c 'import site; print(site.getsitepackages()[0])')/golem" +] diff --git a/golem-cluster.override.2-image.yaml b/golem-cluster.override.2-image.yaml new file mode 100644 index 00000000..ea7d1a1b --- /dev/null +++ b/golem-cluster.override.2-image.yaml @@ -0,0 +1,5 @@ +provider: + parameters: + node_config: + demand: + image_tag: "blueshade/ray-on-golem:0.2.1-py3.10.13-ray2.7.1" diff --git a/golem-cluster.override.3-disable-stats.yaml b/golem-cluster.override.3-disable-stats.yaml new file mode 100644 index 00000000..127ea2a2 --- /dev/null +++ b/golem-cluster.override.3-disable-stats.yaml @@ -0,0 +1,2 @@ +provider: + enable_registry_stats: false diff 
--git a/golem-cluster.yaml b/golem-cluster.yaml index 4013f4a2..d2f04268 100644 --- a/golem-cluster.yaml +++ b/golem-cluster.yaml @@ -21,11 +21,11 @@ provider: # Polygon is for mainnet operations. network: "goerli" - # Maximum amount of GML that's going to be spent (not supported yet) + # Maximum amount of GLMs that's going to be spent for the whole cluster budget: 2 - # Params for creating golem demands (same for head and workers) node_config: + # Parameters for golem demands (same for head and workers) demand: # if not provided, image_tag will be autodetected based on currently used python and ray versions # check available versions at https://registry.golem.network/explore/golem/ray-on-golem diff --git a/pyproject.toml b/pyproject.toml index a96892b8..06c44870 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ray-on-golem" -version = "0.2.1" +version = "0.3.0-dev" description = "Golem Network integration with Ray" authors = ["Golem Factory "] readme = "README.md" @@ -44,6 +44,7 @@ black = "^23.7.0" isort = "^5.12.0" autoflake = "^2.2.0" gvmkit-build = "^0.3.13" +dpath = "^2.1.6" yamlpath = "^3.8.1" [build-system] @@ -58,6 +59,7 @@ _format_black = "black ." 
check_license = {sequence = ["_check_license_export", "_check_license_verify"], help = "Check license compatibility"}
 _check_license_export = "poetry export -f requirements.txt -o .requirements.txt"
 _check_license_verify = "liccheck -r .requirements.txt"
+dev_yaml = {cmd = "python -m utils.apply_overrides -o golem-cluster.dev.yaml golem-cluster.override.*", help="Generate development YAML file."}
 
 [tool.isort]
 profile = "black"
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utils/apply_overrides.py b/utils/apply_overrides.py
new file mode 100644
index 00000000..e0d3262b
--- /dev/null
+++ b/utils/apply_overrides.py
@@ -0,0 +1,57 @@
+import argparse
+import sys
+from pathlib import Path
+from typing import Final
+
+import yaml
+
+from .yaml import load_yamls
+
+BASE_YAML: Final = Path("golem-cluster.yaml")
+LOCAL_OVERRIDE_YAML: Final = Path("golem-cluster.local.yaml")
+
+
+def main():
+    """Merge the base YAML with the overrides and write out the complete document.
+
+    Files are merged in this order: the base file, each positional override in the
+    order given, and finally the local override file if it exists on disk.
+    The result is dumped to the ``--out`` file, or to stdout when ``--out`` is omitted.
+    """
+    # Pass the text as `description`: the first positional parameter of ArgumentParser
+    # is `prog`, so a bare string would replace the program name in the usage line.
+    parser = argparse.ArgumentParser(
+        description="Apply YAML overrides and output a complete YAML."
+    )
+    parser.add_argument("overrides", type=Path, nargs="*", help="Overrides to apply")
+    parser.add_argument(
+        "--base", "-b", type=Path, default=BASE_YAML, help="Base YAML file, default: %(default)s"
+    )
+    parser.add_argument(
+        "--local",
+        "-l",
+        type=Path,
+        default=LOCAL_OVERRIDE_YAML,
+        help="Local override file, default: %(default)s",
+    )
+    parser.add_argument("--out", "-o", type=Path, help="Output file, default: stdout")
+    args = parser.parse_args()
+
+    yaml_files = [args.base]
+    yaml_files.extend(args.overrides)
+
+    # The local override file is optional; when present it is merged last.
+    if args.local.exists():
+        yaml_files.append(args.local)
+
+    data = load_yamls(*yaml_files)
+
+    if args.out:
+        with open(args.out, "w") as f:
+            yaml.dump(data, f)
+    else:
+        yaml.dump(data, sys.stdout)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/yaml.py b/utils/yaml.py
new file mode 100644
index 00000000..17cc96e3
--- /dev/null
+++ b/utils/yaml.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import
Any, Dict
+
+import dpath
+import yaml
+
+
+def load_yamls(*yaml_paths: Path) -> Dict[str, Any]:
+    """Load the provided YAML files, merging their contents in a deep manner.
+
+    The order of the files is relevant, that is: the first YAML is considered the base.
+    All the remaining files are loaded one by one and deeply merged into the base.
+    A file with no YAML content (empty or comments only) contributes nothing to the merge.
+    At least one path is required; calling with no paths raises IndexError.
+
+    Returns a dict representing the result of all YAML files merged into the first one.
+    """
+
+    def _load_yaml(path: Path) -> Dict[str, Any]:
+        with path.open() as f:
+            # An empty document loads as None; substitute {} so it can be merged.
+            return yaml.load(f, yaml.SafeLoader) or {}
+
+    base_dict = _load_yaml(yaml_paths[0])
+    for path in yaml_paths[1:]:
+        # Top-level dpath.merge: the dpath.util wrappers are deprecated in dpath 2.1+.
+        dpath.merge(base_dict, _load_yaml(path))
+
+    return base_dict