Skip to content

Commit

Permalink
[ML] Fix NPE in Get Deployment Stats (#115404) (#115474)
Browse files Browse the repository at this point in the history
If a node has been removed from the cluster and the trained model 
assignment has not been updated the GET stats action can have an
inconsistent view where it thinks a model is deployed on the removed
node. The bug only affected nodes with failed deployments.
  • Loading branch information
davidkyle authored Oct 23, 2024
1 parent 548917a commit cd2a884
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/changelog/115404.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 115404
summary: Fix NPE in Get Deployment Stats
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ static GetDeploymentStatsAction.Response addFailedRoutes(

// add nodes from the failures that were not in the task responses
for (var nodeRoutingState : nodeToRoutingStates.entrySet()) {
if (visitedNodes.contains(nodeRoutingState.getKey()) == false) {
if ((visitedNodes.contains(nodeRoutingState.getKey()) == false) && nodes.nodeExists(nodeRoutingState.getKey())) {
updatedNodeStats.add(
AssignmentStats.NodeStats.forNotStartedState(
nodes.get(nodeRoutingState.getKey()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,45 @@ public void testAddFailedRoutes_TaskResultIsOverwritten() throws UnknownHostExce
assertEquals(RoutingState.FAILED, results.get(0).getNodeStats().get(1).getRoutingState().getState());
}

public void testAddFailedRoutes_MissingNode() throws UnknownHostException {
DiscoveryNodes nodes = buildNodes("node1", "node2");
var missingNode = DiscoveryNodeUtils.create(
"node3",
new TransportAddress(InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 }), 9203)
);

List<AssignmentStats.NodeStats> nodeStatsList = new ArrayList<>();
nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node1")));
nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node2")));

var model1 = new AssignmentStats(
"model1",
"deployment1",
randomBoolean() ? null : randomIntBetween(1, 8),
randomBoolean() ? null : randomIntBetween(1, 8),
null,
randomBoolean() ? null : randomIntBetween(1, 10000),
randomBoolean() ? null : ByteSizeValue.ofBytes(randomLongBetween(1, 1000000)),
Instant.now(),
nodeStatsList,
randomFrom(Priority.values())
);
var response = new GetDeploymentStatsAction.Response(Collections.emptyList(), Collections.emptyList(), List.of(model1), 1);

// failed state for node 3 conflicts
Map<TrainedModelAssignment, Map<String, RoutingInfo>> badRoutes = new HashMap<>();
Map<String, RoutingInfo> nodeRoutes = new HashMap<>();
nodeRoutes.put("node3", new RoutingInfo(1, 1, RoutingState.FAILED, "failed on node3"));
badRoutes.put(createAssignment("model1"), nodeRoutes);

var modified = TransportGetDeploymentStatsAction.addFailedRoutes(response, badRoutes, nodes);
List<AssignmentStats> results = modified.getStats().results();
assertThat(results, hasSize(1));
assertThat(results.get(0).getNodeStats(), hasSize(2)); // 3
assertEquals("node1", results.get(0).getNodeStats().get(0).getNode().getId());
assertEquals("node2", results.get(0).getNodeStats().get(1).getNode().getId());
}

private DiscoveryNodes buildNodes(String... nodeIds) throws UnknownHostException {
InetAddress inetAddress = InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 });
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
Expand Down

0 comments on commit cd2a884

Please sign in to comment.