From 231452e7b11f54f8cc361a471230e542e170928b Mon Sep 17 00:00:00 2001
From: Ryan Bogan
Date: Thu, 18 Apr 2024 12:23:37 -0700
Subject: [PATCH] Refactor cluster state listener transport calls

Signed-off-by: Ryan Bogan
---
Review notes (free text after the three-dash line; not part of the commit
message and not part of the diff):

 * Adds a private helper, getModelMetadata(modelId), to
   TrainingJobClusterStateListener. The helper first calls
   modelDao.getMetadata(modelId). When that returns null it logs an
   info-level message and falls back to
   modelDao.get(modelId).getModelMetadata().
 * updateModelsNewCluster() previously fetched the full Model through
   modelDao.get(modelId) and read its metadata; updateModelsNodesRemoved()
   previously called modelDao.getMetadata(modelId) directly. Both now go
   through the helper, so both share the same null fallback.
 * NOTE(review): the helper dereferences the result of modelDao.get(modelId)
   without a null check, and both callers dereference the helper's result
   immediately (getState(), getNodeAssignment()). Confirm that ModelDao.get
   cannot return null, and that getNodeAssignment() cannot be null, for ids
   returned by searchModelIds().
 * NOTE(review): this copy of the patch appears to have lost its
   angle-bracket content (for example the type argument in "List modelIds"
   and the author e-mail address) as well as its original line breaks and
   indentation. The layout below is a reconstruction; verify it against the
   repository before applying.

 .../TrainingJobClusterStateListener.java | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/opensearch/knn/training/TrainingJobClusterStateListener.java b/src/main/java/org/opensearch/knn/training/TrainingJobClusterStateListener.java
index cd2f1b99d..7e39ff7b3 100644
--- a/src/main/java/org/opensearch/knn/training/TrainingJobClusterStateListener.java
+++ b/src/main/java/org/opensearch/knn/training/TrainingJobClusterStateListener.java
@@ -109,8 +109,7 @@ protected void updateModelsNewCluster() throws IOException, InterruptedException
         if (modelDao.isCreated()) {
             List modelIds = searchModelIds();
             for (String modelId : modelIds) {
-                Model model = modelDao.get(modelId);
-                ModelMetadata modelMetadata = model.getModelMetadata();
+                ModelMetadata modelMetadata = getModelMetadata(modelId);
                 if (modelMetadata.getState().equals(ModelState.TRAINING)) {
                     updateModelStateAsFailed(modelId, modelMetadata, "Training failed to complete as cluster crashed");
                 }
@@ -123,7 +122,7 @@ protected void updateModelsNodesRemoved(List removedNodes) throws
             List modelIds = searchModelIds();
             for (DiscoveryNode removedNode : removedNodes) {
                 for (String modelId : modelIds) {
-                    ModelMetadata modelMetadata = modelDao.getMetadata(modelId);
+                    ModelMetadata modelMetadata = getModelMetadata(modelId);
                     if (modelMetadata.getNodeAssignment().equals(removedNode.getEphemeralId())
                         && modelMetadata.getState().equals(ModelState.TRAINING)) {
                         updateModelStateAsFailed(modelId, modelMetadata, "Training failed to complete as node dropped");
@@ -174,4 +173,17 @@ public void onFailure(Exception e) {
             }
         });
     }
+
+    private ModelMetadata getModelMetadata(String modelId) throws ExecutionException, InterruptedException {
+        ModelMetadata modelMetadata = modelDao.getMetadata(modelId);
+        // On versions prior to 2.14, only models in created state are present in model metadata.
+        if (modelMetadata == null) {
+            log.info(
+                "Model metadata is null in cluster metadata. This can happen for models training on nodes prior to OpenSearch version 2.14.0. Fetching model information from system index."
+            );
+            Model model = modelDao.get(modelId);
+            return model.getModelMetadata();
+        }
+        return modelMetadata;
+    }
 }