From ae5380dbec8929fd288fa28373da78543cef0ba4 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Tue, 6 Sep 2022 16:33:39 +0800 Subject: [PATCH 01/10] remove hardcode credentials --- python/orca/src/bigdl/orca/ray/ray_on_spark_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index 0cc377a64bd..70b9287b490 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -329,7 +329,7 @@ def _start_ray_services(iter): class RayOnSparkContext(object): _active_ray_context = None - def __init__(self, sc, redis_port=None, password="123456", object_store_memory=None, + def __init__(self, sc, redis_port=None, password=None, object_store_memory=None, verbose=False, env=None, extra_params=None, include_webui=True, num_ray_nodes=None, ray_node_cpu_cores=None, system_config=None): """ From fac51f1341b52ba3e930f026f9f2153ddadf846a Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Tue, 6 Sep 2022 16:53:35 +0800 Subject: [PATCH 02/10] update --- python/orca/src/bigdl/orca/ray/ray_on_spark_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index 70b9287b490..358120d6229 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -346,7 +346,7 @@ def __init__(self, sc, redis_port=None, password=None, object_store_memory=None, :param sc: An instance of SparkContext. :param redis_port: The redis port for the ray head node. Default is None. The value would be randomly picked if not specified. - :param password: The password for redis. Default to be "123456" if not specified. + :param password: The password for redis. Default to be "None" if not specified. :param object_store_memory: The memory size for ray object_store in string. This can be specified in bytes(b), kilobytes(k), megabytes(m) or gigabytes(g). For example, "50b", "100k", "250m", "30g". From d39096eb75e47a900ec1bdf61a086e5e5db804bc Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Tue, 6 Sep 2022 17:04:27 +0800 Subject: [PATCH 03/10] add orca ut to PR_validation.yml --- .github/workflows/PR_validation.yml | 70 ++++++++++++++++++++++++++++- .github/workflows/nightly_test.yml | 25 ++++++----- 2 files changed, 82 insertions(+), 13 deletions(-) diff --git a/.github/workflows/PR_validation.yml b/.github/workflows/PR_validation.yml index 3dd2ace0ac8..5c1a490a195 100644 --- a/.github/workflows/PR_validation.yml +++ b/.github/workflows/PR_validation.yml @@ -62,7 +62,6 @@ jobs: - name: Run test uses: ./.github/actions/dllib-scala-ut-action - Dllib-Scala-UT: needs: changes if: ${{ needs.changes.outputs.dllib == 'true' }} @@ -77,6 +76,75 @@ jobs: - name: Run test uses: ./.github/actions/dllib-scala-ut-action + Orca-Python-ExampleTest-Py37-Spark3: + needs: changes + if: ${{ needs.changes.outputs.orca == 'true' }} + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-python-exampletest-action + + Orca-Python-ExampleTest-Ray-Py37-Spark3: + needs: changes + if: ${{ needs.changes.outputs.orca == 'true' }} + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-python-exampletest-ray-action + + Orca-Jep-ExampleTest-Py37-Spark2: + needs: changes + if: ${{ needs.changes.outputs.orca == 'true' }} + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-jep-exampletest-action + + Orca-Python-Ray-Py37-Spark3: + needs: changes + if: ${{ needs.changes.outputs.orca == 'true' }} + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-python-ray-py37-spark3-action + + Orca-Python-Py37-Spark3: + needs: changes + if: ${{ needs.changes.outputs.orca == 'true' }} + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-python-py37-spark3-action Orca-Ray-Ctx-Example: needs: changes diff --git a/.github/workflows/nightly_test.yml b/.github/workflows/nightly_test.yml index 39496894524..10c7e59e9ae 100644 --- a/.github/workflows/nightly_test.yml +++ b/.github/workflows/nightly_test.yml @@ -42,7 +42,6 @@ jobs: - name: Run test uses: ./.github/actions/orca-python-exampletest-action - Orca-Python-ExampleTest-Ray-Py37-Spark3: if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Python-ExampleTest-Ray-Py37-Spark3' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] @@ -95,6 +94,19 @@ jobs: - name: Run test uses: ./.github/actions/orca-python-py37-spark3-action + Orca-Ray-Ctx-Example: + if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Ray-Ctx-Example' || github.event.inputs.artifact == 'all' }} + runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-ray-ctx-example-action + Dllib-Scala-UT: if: ${{ github.event.schedule || github.event.inputs.artifact == 'Dllib-Scala-UT' || github.event.inputs.artifact == 'all' }} runs-on: [ self-hosted, Gondolin, ubuntu-20.04-lts ] @@ -134,15 +146,4 @@ jobs: - name: Run test uses: ./.github/actions/ppml-scala-ut-action - Orca-Ray-Ctx-Example: - if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Ray-Ctx-Example' || github.event.inputs.artifact == 'all' }} - runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-ray-ctx-example-action From 2f075cc2095a8c3c098d580ad72552e345097c13 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Wed, 7 Sep 2022 16:07:15 +0800 Subject: [PATCH 04/10] use redis_paasword in extra_params otherwise use the default password in ray --- python/orca/src/bigdl/orca/common.py | 2 +- .../bigdl/orca/ray/ray_on_spark_context.py | 54 ++++++++++--------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/python/orca/src/bigdl/orca/common.py b/python/orca/src/bigdl/orca/common.py index 8c7cb571726..0b5b5a71a20 100644 --- a/python/orca/src/bigdl/orca/common.py +++ b/python/orca/src/bigdl/orca/common.py @@ -341,7 +341,7 @@ def init_orca_context(cluster_mode=None, runtime="spark", cores=None, memory="2g "spark-submit or bigdl-submit, " "but got: %s".format(cluster_mode)) ray_args = {} - for key in ["redis_port", "password", "object_store_memory", "verbose", "env", + for key in ["redis_port", "object_store_memory", "verbose", "env", "extra_params", "num_ray_nodes", "ray_node_cpu_cores", "include_webui", "system_config"]: if key in kwargs: diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index 358120d6229..7e983dbc6a0 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -108,13 +108,12 @@ def _prepare_env(self): return modified_env def __init__(self, python_loc, redis_port, ray_node_cpu_cores, - password, object_store_memory, verbose=False, env=None, + object_store_memory, verbose=False, env=None, include_webui=False, extra_params=None, system_config=None): """object_store_memory: integer in bytes""" self.env = env self.python_loc = python_loc self.redis_port = redis_port - self.password = password self.ray_node_cpu_cores = ray_node_cpu_cores self.ray_exec = self._get_ray_exec() self.object_store_memory = object_store_memory @@ -150,17 +149,17 @@ def _enrich_command(command, object_store_memory, extra_params): if object_store_memory: command = command + " --object-store-memory {}".format(str(object_store_memory)) if extra_params: - for pair in extra_params.items(): - command = command + " --{} {}".format(pair[0], pair[1]) + for k, v in extra_params.items(): + kw = k.replace("_", "-") + command = command + " --{} {}".format(kw, v) return command def _gen_master_command(self): webui = "true" if self.include_webui else "false" command = "{} start --head " \ "--include-dashboard {} --dashboard-host 0.0.0.0 --port {} " \ - "--redis-password {} --num-cpus {}". \ - format(self.ray_exec, webui, self.redis_port, self.password, - self.ray_node_cpu_cores) + "--num-cpus {}". \ + format(self.ray_exec, webui, self.redis_port, self.ray_node_cpu_cores) if self.labels: command = command + " " + self.labels if self.system_config: @@ -173,13 +172,12 @@ def _gen_master_command(self): @staticmethod def _get_raylet_command(redis_address, ray_exec, - password, ray_node_cpu_cores, labels="", object_store_memory=None, extra_params=None): - command = "{} start --address {} --redis-password {} --num-cpus {}".format( - ray_exec, redis_address, password, ray_node_cpu_cores) + command = "{} start --address {} --num-cpus {}".format( + ray_exec, redis_address, ray_node_cpu_cores) if labels: command = command + " " + labels return RayServiceFuncGenerator._enrich_command(command=command, @@ -262,7 +260,6 @@ def _start_raylets(iter): command=RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec=self.ray_exec, - password=self.password, ray_node_cpu_cores=self.ray_node_cpu_cores, labels=self.labels, object_store_memory=self.object_store_memory, @@ -312,7 +309,6 @@ def _start_ray_services(iter): command=RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec=self.ray_exec, - password=self.password, ray_node_cpu_cores=self.ray_node_cpu_cores, labels=self.labels, object_store_memory=self.object_store_memory, @@ -329,7 +325,7 @@ def _start_ray_services(iter): class RayOnSparkContext(object): _active_ray_context = None - def __init__(self, sc, redis_port=None, password=None, object_store_memory=None, + def __init__(self, sc, redis_port=None, object_store_memory=None, verbose=False, env=None, extra_params=None, include_webui=True, num_ray_nodes=None, ray_node_cpu_cores=None, system_config=None): """ @@ -346,14 +342,13 @@ def __init__(self, sc, redis_port=None, password=None, object_store_memory=None, :param sc: An instance of SparkContext. :param redis_port: The redis port for the ray head node. Default is None. The value would be randomly picked if not specified. - :param password: The password for redis. Default to be "None" if not specified. :param object_store_memory: The memory size for ray object_store in string. This can be specified in bytes(b), kilobytes(k), megabytes(m) or gigabytes(g). For example, "50b", "100k", "250m", "30g". :param verbose: True for more logs when starting ray. Default is False. :param env: The environment variable dict for running ray processes. Default is None. :param extra_params: The key value dict for extra options to launch ray. - For example, extra_params={"temp-dir": "/tmp/ray/"} + For example, extra_params={"temp-dir": "/tmp/ray/", "redis-password": "xxxxxx"} :param include_webui: True for including web ui when starting ray. Default is False. :param num_ray_nodes: The number of raylets to start across the cluster. For Spark local mode, you don't need to specify this value. @@ -379,7 +374,6 @@ def __init__(self, sc, redis_port=None, password=None, object_store_memory=None, self.initialized = False self.is_local = is_local(sc) self.verbose = verbose - self.redis_password = password self.object_store_memory = resource_to_bytes(object_store_memory) self.ray_processesMonitor = None self.env = env @@ -463,7 +457,6 @@ def setup(self): python_loc=self.python_loc, redis_port=self.redis_port, ray_node_cpu_cores=self.ray_node_cpu_cores, - password=self.redis_password, object_store_memory=self.object_store_memory, verbose=self.verbose, env=self.env, @@ -529,6 +522,14 @@ def _get_spark_local_cores(self): else: return int(local_symbol) + def _update_extra_params(self, extra_params): + kwargs = {} + if extra_params is not None: + for k, v in extra_params.items(): + kw = k.replace("-", "_") + kwargs[kw] = v + return kwargs + def init(self, driver_cores=0): """ Initiate the ray cluster. @@ -546,19 +547,17 @@ def init(self, driver_cores=0): if self.env: os.environ.update(self.env) import ray - kwargs = {} - if self.extra_params is not None: - for k, v in self.extra_params.items(): - kw = k.replace("-", "_") - kwargs[kw] = v + kwargs = self._update_extra_params(self.extra_params) init_params = dict( num_cpus=self.ray_node_cpu_cores, - _redis_password=self.redis_password, object_store_memory=self.object_store_memory, include_dashboard=self.include_webui, dashboard_host="0.0.0.0", _system_config=self.system_config ) + if "redis_password" in kwargs: + kwargs["_redis_password"] = kwargs["redis_password"] + kwargs.pop("redis_password") init_params.update(kwargs) if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" @@ -624,7 +623,6 @@ def _start_restricted_worker(self, num_cores, node_ip_address, redis_address): command = RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec="ray", - password=self.redis_password, ray_node_cpu_cores=num_cores, object_store_memory=self.object_store_memory, extra_params=extra_param) @@ -637,18 +635,22 @@ def _start_restricted_worker(self, num_cores, node_ip_address, redis_address): pgid_to_kill=process_info.pgid) def _start_driver(self, num_cores, redis_address): - print("Start to launch ray driver on local") + print("Start to launch ray driver") import ray._private.services node_ip = ray._private.services.get_node_ip_address(redis_address) self._start_restricted_worker(num_cores=num_cores, node_ip_address=node_ip, redis_address=redis_address) ray.shutdown() + kwargs = self._update_extra_params(self.extra_params) init_params = dict( address=redis_address, - _redis_password=self.ray_service.password, _node_ip_address=node_ip ) + if "redis_password" in kwargs: + kwargs["_redis_password"] = kwargs["redis_password"] + kwargs.pop("redis_password") + init_params.update(kwargs) if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" return ray.init(**init_params) From b637aed85462b620b296b0d912cc55106247b674 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Wed, 7 Sep 2022 16:56:14 +0800 Subject: [PATCH 05/10] refine code --- python/orca/src/bigdl/orca/ray/ray_on_spark_context.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index 7e983dbc6a0..f32d7e40dd0 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -528,6 +528,9 @@ def _update_extra_params(self, extra_params): for k, v in extra_params.items(): kw = k.replace("-", "_") kwargs[kw] = v + if "redis_password" in kwargs: + kwargs["_redis_password"] = kwargs["redis_password"] + kwargs.pop("redis_password") return kwargs def init(self, driver_cores=0): @@ -555,9 +558,6 @@ def init(self, driver_cores=0): dashboard_host="0.0.0.0", _system_config=self.system_config ) - if "redis_password" in kwargs: - kwargs["_redis_password"] = kwargs["redis_password"] - kwargs.pop("redis_password") init_params.update(kwargs) if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" @@ -647,9 +647,6 @@ def _start_driver(self, num_cores, redis_address): address=redis_address, _node_ip_address=node_ip ) - if "redis_password" in kwargs: - kwargs["_redis_password"] = kwargs["redis_password"] - kwargs.pop("redis_password") init_params.update(kwargs) if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" From 850ea236bc6d8d4fd9ede19c0bc0c65d3ad0b0d6 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Thu, 8 Sep 2022 15:20:47 +0800 Subject: [PATCH 06/10] update --- python/orca/src/bigdl/orca/common.py | 2 +- .../orca/src/bigdl/orca/data/ray_xshards.py | 3 +- .../bigdl/orca/ray/ray_on_spark_context.py | 29 +++++++++++++------ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/python/orca/src/bigdl/orca/common.py b/python/orca/src/bigdl/orca/common.py index 0b5b5a71a20..6e0664748c9 100644 --- a/python/orca/src/bigdl/orca/common.py +++ b/python/orca/src/bigdl/orca/common.py @@ -341,7 +341,7 @@ def init_orca_context(cluster_mode=None, runtime="spark", cores=None, memory="2g "spark-submit or bigdl-submit, " "but got: %s".format(cluster_mode)) ray_args = {} - for key in ["redis_port", "object_store_memory", "verbose", "env", + for key in ["redis_port", "redis_password", "object_store_memory", "verbose", "env", "extra_params", "num_ray_nodes", "ray_node_cpu_cores", "include_webui", "system_config"]: if key in kwargs: diff --git a/python/orca/src/bigdl/orca/data/ray_xshards.py b/python/orca/src/bigdl/orca/data/ray_xshards.py index 55c8a4b1260..5a0c80aa55d 100644 --- a/python/orca/src/bigdl/orca/data/ray_xshards.py +++ b/python/orca/src/bigdl/orca/data/ray_xshards.py @@ -85,9 +85,10 @@ def init_ray_if_not(redis_address, redis_password): if not ray.is_initialized(): init_params = dict( address=redis_address, - _redis_password=redis_password, ignore_reinit_error=True ) + if redis_password: + init_params["_redis_password"] = self.redis_password if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" ray.init(**init_params) diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index f32d7e40dd0..343ebf4ef6a 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -107,13 +107,14 @@ def _prepare_env(self): print("The $PATH is: {}".format(modified_env["PATH"])) return modified_env - def __init__(self, python_loc, redis_port, ray_node_cpu_cores, + def __init__(self, python_loc, redis_port, redis_password, ray_node_cpu_cores, object_store_memory, verbose=False, env=None, include_webui=False, extra_params=None, system_config=None): """object_store_memory: integer in bytes""" self.env = env self.python_loc = python_loc self.redis_port = redis_port + self.redis_password = redis_password self.ray_node_cpu_cores = ray_node_cpu_cores self.ray_exec = self._get_ray_exec() self.object_store_memory = object_store_memory @@ -158,8 +159,10 @@ def _gen_master_command(self): webui = "true" if self.include_webui else "false" command = "{} start --head " \ "--include-dashboard {} --dashboard-host 0.0.0.0 --port {} " \ - "--num-cpus {}". \ + "--num-cpus {}" \ format(self.ray_exec, webui, self.redis_port, self.ray_node_cpu_cores) + if self.redis_password: + command = command + " --redis-password {}".format(self.redis_password) if self.labels: command = command + " " + self.labels if self.system_config: @@ -172,12 +175,15 @@ def _gen_master_command(self): @staticmethod def _get_raylet_command(redis_address, ray_exec, + redis_password, ray_node_cpu_cores, labels="", object_store_memory=None, extra_params=None): command = "{} start --address {} --num-cpus {}".format( ray_exec, redis_address, ray_node_cpu_cores) + if redis_password: + command = command + " --redis-password {}".format(redis_password) if labels: command = command + " " + labels return RayServiceFuncGenerator._enrich_command(command=command, @@ -260,6 +266,7 @@ def _start_raylets(iter): command=RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec=self.ray_exec, + redis_password=self.redis_password, ray_node_cpu_cores=self.ray_node_cpu_cores, labels=self.labels, object_store_memory=self.object_store_memory, @@ -309,6 +316,7 @@ def _start_ray_services(iter): command=RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec=self.ray_exec, + redis_password=self.redis_password, ray_node_cpu_cores=self.ray_node_cpu_cores, labels=self.labels, object_store_memory=self.object_store_memory, @@ -325,7 +333,7 @@ def _start_ray_services(iter): class RayOnSparkContext(object): _active_ray_context = None - def __init__(self, sc, redis_port=None, object_store_memory=None, + def __init__(self, sc, redis_port=None, redis_password=None, object_store_memory=None, verbose=False, env=None, extra_params=None, include_webui=True, num_ray_nodes=None, ray_node_cpu_cores=None, system_config=None): """ @@ -342,13 +350,14 @@ def __init__(self, sc, redis_port=None, object_store_memory=None, :param sc: An instance of SparkContext. :param redis_port: The redis port for the ray head node. Default is None. The value would be randomly picked if not specified. + :param redis_password: The password for redis. Default to be None if not specified. :param object_store_memory: The memory size for ray object_store in string. This can be specified in bytes(b), kilobytes(k), megabytes(m) or gigabytes(g). For example, "50b", "100k", "250m", "30g". :param verbose: True for more logs when starting ray. Default is False. :param env: The environment variable dict for running ray processes. Default is None. :param extra_params: The key value dict for extra options to launch ray. - For example, extra_params={"temp-dir": "/tmp/ray/", "redis-password": "xxxxxx"} + For example, extra_params={"temp-dir": "/tmp/ray/"} :param include_webui: True for including web ui when starting ray. Default is False. :param num_ray_nodes: The number of raylets to start across the cluster. For Spark local mode, you don't need to specify this value. @@ -374,6 +383,7 @@ def __init__(self, sc, redis_port=None, object_store_memory=None, self.initialized = False self.is_local = is_local(sc) self.verbose = verbose + self.redis_password = redis_password self.object_store_memory = resource_to_bytes(object_store_memory) self.ray_processesMonitor = None self.env = env @@ -456,6 +466,7 @@ def setup(self): self.ray_service = RayServiceFuncGenerator( python_loc=self.python_loc, redis_port=self.redis_port, + redis_password=self.redis_password, ray_node_cpu_cores=self.ray_node_cpu_cores, object_store_memory=self.object_store_memory, verbose=self.verbose, @@ -528,9 +539,6 @@ def _update_extra_params(self, extra_params): for k, v in extra_params.items(): kw = k.replace("-", "_") kwargs[kw] = v - if "redis_password" in kwargs: - kwargs["_redis_password"] = kwargs["redis_password"] - kwargs.pop("redis_password") return kwargs def init(self, driver_cores=0): @@ -558,6 +566,8 @@ def init(self, driver_cores=0): dashboard_host="0.0.0.0", _system_config=self.system_config ) + if self.redis_password: + init_params["_redis_password"] = self.redis_password init_params.update(kwargs) if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" @@ -623,6 +633,7 @@ def _start_restricted_worker(self, num_cores, node_ip_address, redis_address): command = RayServiceFuncGenerator._get_raylet_command( redis_address=redis_address, ray_exec="ray", + redis_password=self.redis_password, ray_node_cpu_cores=num_cores, object_store_memory=self.object_store_memory, extra_params=extra_param) @@ -642,12 +653,12 @@ def _start_driver(self, num_cores, redis_address): node_ip_address=node_ip, redis_address=redis_address) ray.shutdown() - kwargs = self._update_extra_params(self.extra_params) init_params = dict( address=redis_address, _node_ip_address=node_ip ) - init_params.update(kwargs) + if self.redis_password: + init_params["_redis_password"] = self.redis_password if version.parse(ray.__version__) >= version.parse("1.4.0"): init_params["namespace"] = "az" return ray.init(**init_params) From efabbcb474fd4a5c4b7445fbb59af3786ab9a98f Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Thu, 8 Sep 2022 15:29:04 +0800 Subject: [PATCH 07/10] typo --- python/orca/src/bigdl/orca/ray/ray_on_spark_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py index 343ebf4ef6a..3f64e5130a0 100644 --- a/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py +++ b/python/orca/src/bigdl/orca/ray/ray_on_spark_context.py @@ -159,7 +159,7 @@ def _gen_master_command(self): webui = "true" if self.include_webui else "false" command = "{} start --head " \ "--include-dashboard {} --dashboard-host 0.0.0.0 --port {} " \ - "--num-cpus {}" \ + "--num-cpus {}". \ format(self.ray_exec, webui, self.redis_port, self.ray_node_cpu_cores) if self.redis_password: command = command + " --redis-password {}".format(self.redis_password) From fe11da99ed55ea46c9dfae685dcfc1523d1f8fe9 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Thu, 8 Sep 2022 18:27:22 +0800 Subject: [PATCH 08/10] remove orca ut from pr validation --- .github/workflows/PR_validation.yml | 71 ----------------------------- 1 file changed, 71 deletions(-) diff --git a/.github/workflows/PR_validation.yml b/.github/workflows/PR_validation.yml index 5c1a490a195..e5cae2a9ad6 100644 --- a/.github/workflows/PR_validation.yml +++ b/.github/workflows/PR_validation.yml @@ -76,77 +76,6 @@ jobs: - name: Run test uses: ./.github/actions/dllib-scala-ut-action - Orca-Python-ExampleTest-Py37-Spark3: - needs: changes - if: ${{ needs.changes.outputs.orca == 'true' }} - runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-python-exampletest-action - - Orca-Python-ExampleTest-Ray-Py37-Spark3: - needs: changes - if: ${{ needs.changes.outputs.orca == 'true' }} - runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-python-exampletest-ray-action - - Orca-Jep-ExampleTest-Py37-Spark2: - needs: changes - if: ${{ needs.changes.outputs.orca == 'true' }} - runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-jep-exampletest-action - - Orca-Python-Ray-Py37-Spark3: - needs: changes - if: ${{ needs.changes.outputs.orca == 'true' }} - runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-python-ray-py37-spark3-action - - Orca-Python-Py37-Spark3: - needs: changes - if: ${{ needs.changes.outputs.orca == 'true' }} - runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-python-py37-spark3-action - - Orca-Ray-Ctx-Example: needs: changes if: ${{ needs.changes.outputs.orca == 'true' }} runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] From d6eb9fa10e810a32c89def58bfba0799f5558024 Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Tue, 13 Sep 2022 09:22:24 +0800 Subject: [PATCH 09/10] update --- .github/workflows/PR_validation.yml | 1 + .github/workflows/nightly_test.yml | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/PR_validation.yml b/.github/workflows/PR_validation.yml index e5cae2a9ad6..d20dfabc30d 100644 --- a/.github/workflows/PR_validation.yml +++ b/.github/workflows/PR_validation.yml @@ -76,6 +76,7 @@ jobs: - name: Run test uses: ./.github/actions/dllib-scala-ut-action + Orca-Ray-Ctx-Example: needs: changes if: ${{ needs.changes.outputs.orca == 'true' }} runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] diff --git a/.github/workflows/nightly_test.yml b/.github/workflows/nightly_test.yml index 10c7e59e9ae..91d8009fd83 100644 --- a/.github/workflows/nightly_test.yml +++ b/.github/workflows/nightly_test.yml @@ -146,4 +146,15 @@ jobs: - name: Run test uses: ./.github/actions/ppml-scala-ut-action - + create-workflow-badge: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: create workflow badge + if: ${{ always() }} + uses: ./.github/actions/create-job-status-badge + with: + secret: ${{ secrets.GIST_SECRET }} + gist-id: bc8a699b455bced4a1aef138ad5df07e + file-name: nightly-test.json + type: workflow From 8addaad26b1a8fbf9a0370c15c1313ee62845d5f Mon Sep 17 00:00:00 2001 From: Shaojun Liu Date: Tue, 13 Sep 2022 09:25:54 +0800 Subject: [PATCH 10/10] update --- .github/workflows/nightly_test.yml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/nightly_test.yml b/.github/workflows/nightly_test.yml index 91d8009fd83..30259ca1eec 100644 --- a/.github/workflows/nightly_test.yml +++ b/.github/workflows/nightly_test.yml @@ -42,6 +42,7 @@ jobs: - name: Run test uses: ./.github/actions/orca-python-exampletest-action + Orca-Python-ExampleTest-Ray-Py37-Spark3: if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Python-ExampleTest-Ray-Py37-Spark3' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] @@ -94,19 +95,6 @@ jobs: - name: Run test uses: ./.github/actions/orca-python-py37-spark3-action - Orca-Ray-Ctx-Example: - if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Ray-Ctx-Example' || github.event.inputs.artifact == 'all' }} - runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] - - steps: - - uses: actions/checkout@v3 - - name: Set up JDK8 - uses: ./.github/actions/jdk-setup-action - - name: Set up maven - uses: ./.github/actions/maven-setup-action - - name: Run test - uses: ./.github/actions/orca-ray-ctx-example-action - Dllib-Scala-UT: if: ${{ github.event.schedule || github.event.inputs.artifact == 'Dllib-Scala-UT' || github.event.inputs.artifact == 'all' }} runs-on: [ self-hosted, Gondolin, ubuntu-20.04-lts ] @@ -146,6 +134,19 @@ jobs: - name: Run test uses: ./.github/actions/ppml-scala-ut-action + Orca-Ray-Ctx-Example: + if: ${{ github.event.schedule || github.event.inputs.artifact == 'Orca-Ray-Ctx-Example' || github.event.inputs.artifact == 'all' }} + runs-on: [self-hosted, Gondolin-resources, ubuntu-20.04-lts] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: Run test + uses: ./.github/actions/orca-ray-ctx-example-action + create-workflow-badge: runs-on: ubuntu-latest steps: @@ -157,4 +158,4 @@ jobs: secret: ${{ secrets.GIST_SECRET }} gist-id: bc8a699b455bced4a1aef138ad5df07e file-name: nightly-test.json - type: workflow + type: workflow \ No newline at end of file