From cd2a8845cd6dc918d40f11a027e1525f2686f037 Mon Sep 17 00:00:00 2001 From: David Kyle Date: Wed, 23 Oct 2024 23:17:24 +0100 Subject: [PATCH] [ML] Fix NPE in Get Deployment Stats (#115404) (#115474) If a node has been removed from the cluster and the trained model assignment has not been updated the GET stats action can have an inconsistent view where it thinks a model is deployed on the removed node. The bug only affected nodes with failed deployments. --- docs/changelog/115404.yaml | 5 +++ .../TransportGetDeploymentStatsAction.java | 2 +- ...ransportGetDeploymentStatsActionTests.java | 39 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/115404.yaml diff --git a/docs/changelog/115404.yaml b/docs/changelog/115404.yaml new file mode 100644 index 0000000000000..e443b152955f3 --- /dev/null +++ b/docs/changelog/115404.yaml @@ -0,0 +1,5 @@ +pr: 115404 +summary: Fix NPE in Get Deployment Stats +area: Machine Learning +type: bug +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java index 980cdc09252cb..9ebc510af4f4d 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java @@ -220,7 +220,7 @@ static GetDeploymentStatsAction.Response addFailedRoutes( // add nodes from the failures that were not in the task responses for (var nodeRoutingState : nodeToRoutingStates.entrySet()) { - if (visitedNodes.contains(nodeRoutingState.getKey()) == false) { + if ((visitedNodes.contains(nodeRoutingState.getKey()) == false) && nodes.nodeExists(nodeRoutingState.getKey())) { updatedNodeStats.add( AssignmentStats.NodeStats.forNotStartedState( nodes.get(nodeRoutingState.getKey()), diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java index 4a66be4a773f5..2490cd8d5ab21 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java @@ -148,6 +148,45 @@ public void testAddFailedRoutes_TaskResultIsOverwritten() throws UnknownHostExce assertEquals(RoutingState.FAILED, results.get(0).getNodeStats().get(1).getRoutingState().getState()); } + public void testAddFailedRoutes_MissingNode() throws UnknownHostException { + DiscoveryNodes nodes = buildNodes("node1", "node2"); + var missingNode = DiscoveryNodeUtils.create( + "node3", + new TransportAddress(InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 }), 9203) + ); + + List nodeStatsList = new ArrayList<>(); + nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node1"))); + nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node2"))); + + var model1 = new AssignmentStats( + "model1", + "deployment1", + randomBoolean() ? null : randomIntBetween(1, 8), + randomBoolean() ? null : randomIntBetween(1, 8), + null, + randomBoolean() ? null : randomIntBetween(1, 10000), + randomBoolean() ? null : ByteSizeValue.ofBytes(randomLongBetween(1, 1000000)), + Instant.now(), + nodeStatsList, + randomFrom(Priority.values()) + ); + var response = new GetDeploymentStatsAction.Response(Collections.emptyList(), Collections.emptyList(), List.of(model1), 1); + + // failed state for node 3 conflicts + Map> badRoutes = new HashMap<>(); + Map nodeRoutes = new HashMap<>(); + nodeRoutes.put("node3", new RoutingInfo(1, 1, RoutingState.FAILED, "failed on node3")); + badRoutes.put(createAssignment("model1"), nodeRoutes); + + var modified = TransportGetDeploymentStatsAction.addFailedRoutes(response, badRoutes, nodes); + List results = modified.getStats().results(); + assertThat(results, hasSize(1)); + assertThat(results.get(0).getNodeStats(), hasSize(2)); // 3 + assertEquals("node1", results.get(0).getNodeStats().get(0).getNode().getId()); + assertEquals("node2", results.get(0).getNodeStats().get(1).getNode().getId()); + } + private DiscoveryNodes buildNodes(String... nodeIds) throws UnknownHostException { InetAddress inetAddress = InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 }); DiscoveryNodes.Builder builder = DiscoveryNodes.builder();