From 6df44e3f7998b2f87869de895e375b1502fbdcbf Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Sat, 30 Dec 2023 00:06:26 -0800 Subject: [PATCH] Cluster restart model auto redeploy (#1627) (#1827) * Fix cluster level restart model not auto redeploy issue Signed-off-by: zane-neo * Fix cluster level restart model not auto redeploy issue Signed-off-by: zane-neo * Remove unnecessary changes Signed-off-by: zane-neo * format code Signed-off-by: zane-neo * Add auto redeploy success ratio configuration Signed-off-by: zane-neo * Add start cron job log Signed-off-by: zane-neo --------- Signed-off-by: zane-neo (cherry picked from commit 034a21295cbccfd38b280e05e4c90ceccda34d2c) Co-authored-by: zane-neo --- .../autoredeploy/MLModelAutoReDeployer.java | 22 +++++++++++-- .../MLCommonsClusterManagerEventListener.java | 31 ++++++++++++++++--- .../ml/plugin/MachineLearningPlugin.java | 4 ++- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java b/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java index dc322f2836..ff774ac0be 100644 --- a/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java +++ b/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java @@ -50,6 +50,7 @@ import lombok.Builder; import lombok.Data; +import lombok.Setter; import lombok.extern.log4j.Log4j2; @Log4j2 @@ -68,6 +69,9 @@ public class MLModelAutoReDeployer { private final SearchRequestBuilderFactory searchRequestBuilderFactory; + @Setter + private ActionListener startCronJobListener; + public MLModelAutoReDeployer( ClusterService clusterService, Client client, @@ -126,6 +130,7 @@ Consumer undeployModelsOnDataNodesConsumer() { public void buildAutoReloadArrangement(List addedNodes, String clusterManagerNodeId) { if (!enableAutoReDeployModel) { log.info("Model auto reload 
configuration is false, not performing auto reloading!"); + startCronjobAndClearListener(); return; } String localNodeId = clusterService.localNode().getId(); @@ -142,10 +147,12 @@ public void buildAutoReloadArrangement(List addedNodes, String clusterMa public void redeployAModel() { if (!enableAutoReDeployModel) { log.info("Model auto reload configuration is false, not performing auto reloading!"); + startCronjobAndClearListener(); return; } if (modelAutoRedeployArrangements.size() == 0) { log.info("No models needs to be auto redeployed!"); + startCronjobAndClearListener(); return; } ModelAutoRedeployArrangement modelAutoRedeployArrangement = modelAutoRedeployArrangements.poll(); @@ -176,9 +183,10 @@ private void triggerAutoDeployModels(List addedNodes) { }); redeployAModel(); } - }, - e -> { log.error("Failed to query need auto redeploy models, no action will be performed, addedNodes are: {}", addedNodes, e); } - ); + }, e -> { + log.error("Failed to query need auto redeploy models, no action will be performed, addedNodes are: {}", addedNodes, e); + startCronjobAndClearListener(); + }); queryRunningModels(listener); } @@ -296,6 +304,14 @@ private void triggerModelRedeploy(ModelAutoRedeployArrangement modelAutoRedeploy client.execute(MLDeployModelAction.INSTANCE, deployModelRequest, listener); } + private void startCronjobAndClearListener() { + boolean managerNode = clusterService.localNode().isClusterManagerNode(); + if (managerNode && startCronJobListener != null) { + startCronJobListener.onResponse(true); + startCronJobListener = null; + } + } + @Data @Builder static class ModelAutoRedeployArrangement { diff --git a/plugin/src/main/java/org/opensearch/ml/cluster/MLCommonsClusterManagerEventListener.java b/plugin/src/main/java/org/opensearch/ml/cluster/MLCommonsClusterManagerEventListener.java index 327ae2ddae..0cf4215a23 100644 --- a/plugin/src/main/java/org/opensearch/ml/cluster/MLCommonsClusterManagerEventListener.java +++ 
b/plugin/src/main/java/org/opensearch/ml/cluster/MLCommonsClusterManagerEventListener.java @@ -8,12 +8,16 @@ import static org.opensearch.ml.plugin.MachineLearningPlugin.GENERAL_THREAD_POOL; import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_SYNC_UP_JOB_INTERVAL_IN_SECONDS; +import java.util.List; + import org.opensearch.client.Client; import org.opensearch.cluster.LocalNodeClusterManagerListener; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.lifecycle.LifecycleListener; import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.action.ActionListener; +import org.opensearch.ml.autoredeploy.MLModelAutoReDeployer; import org.opensearch.ml.engine.encryptor.Encryptor; import org.opensearch.ml.engine.indices.MLIndicesHandler; import org.opensearch.threadpool.Scheduler; @@ -35,6 +39,8 @@ public class MLCommonsClusterManagerEventListener implements LocalNodeClusterMan private volatile Integer jobInterval; + private final MLModelAutoReDeployer mlModelAutoReDeployer; + public MLCommonsClusterManagerEventListener( ClusterService clusterService, Client client, @@ -42,7 +48,8 @@ public MLCommonsClusterManagerEventListener( ThreadPool threadPool, DiscoveryNodeHelper nodeHelper, MLIndicesHandler mlIndicesHandler, - Encryptor encryptor + Encryptor encryptor, + MLModelAutoReDeployer modelAutoReDeployer ) { this.clusterService = clusterService; this.client = client; @@ -51,6 +58,7 @@ public MLCommonsClusterManagerEventListener( this.nodeHelper = nodeHelper; this.mlIndicesHandler = mlIndicesHandler; this.encryptor = encryptor; + this.mlModelAutoReDeployer = modelAutoReDeployer; this.jobInterval = ML_COMMONS_SYNC_UP_JOB_INTERVAL_IN_SECONDS.get(settings); clusterService.getClusterSettings().addSettingsUpdateConsumer(ML_COMMONS_SYNC_UP_JOB_INTERVAL_IN_SECONDS, it -> { @@ -62,13 +70,28 @@ public MLCommonsClusterManagerEventListener( @Override public void 
onClusterManager() { - if (syncModelRoutingCron == null) { - startSyncModelRoutingCron(); - } + ActionListener listener = ActionListener.wrap(r -> { + if (syncModelRoutingCron == null) { + startSyncModelRoutingCron(); + } + }, e -> { + if (syncModelRoutingCron == null) { + startSyncModelRoutingCron(); + } + }); + mlModelAutoReDeployer.setStartCronJobListener(listener); + String localNodeId = clusterService.localNode().getId(); + threadPool + .schedule( + () -> mlModelAutoReDeployer.buildAutoReloadArrangement(List.of(localNodeId), localNodeId), + TimeValue.timeValueSeconds(jobInterval), + GENERAL_THREAD_POOL + ); } private void startSyncModelRoutingCron() { if (jobInterval > 0) { + log.info("Starting ML sync up job..."); syncModelRoutingCron = threadPool .scheduleWithFixedDelay( new MLSyncUpCron(client, clusterService, nodeHelper, mlIndicesHandler, encryptor), diff --git a/plugin/src/main/java/org/opensearch/ml/plugin/MachineLearningPlugin.java b/plugin/src/main/java/org/opensearch/ml/plugin/MachineLearningPlugin.java index 5a9394f0ae..fb469b3354 100644 --- a/plugin/src/main/java/org/opensearch/ml/plugin/MachineLearningPlugin.java +++ b/plugin/src/main/java/org/opensearch/ml/plugin/MachineLearningPlugin.java @@ -596,7 +596,8 @@ public Collection createComponents( threadPool, nodeHelper, mlIndicesHandler, - encryptor + encryptor, + mlModelAutoRedeployer ); // TODO move this into MLFeatureEnabledSetting @@ -845,6 +846,7 @@ public List> getSettings() { MLCommonsSettings.ML_COMMONS_ENABLE_INHOUSE_PYTHON_MODEL, MLCommonsSettings.ML_COMMONS_MODEL_AUTO_REDEPLOY_ENABLE, MLCommonsSettings.ML_COMMONS_MODEL_AUTO_REDEPLOY_LIFETIME_RETRY_TIMES, + MLCommonsSettings.ML_COMMONS_MODEL_AUTO_REDEPLOY_SUCCESS_RATIO, MLCommonsSettings.ML_COMMONS_ALLOW_MODEL_URL, MLCommonsSettings.ML_COMMONS_ALLOW_LOCAL_FILE_UPLOAD, MLCommonsSettings.ML_COMMONS_MODEL_ACCESS_CONTROL_ENABLED,