Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.3] [ML] Rename trained model metadata after full cluster upgrade (#87806) #87814

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.ResourceAlreadyExistsException;
import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.cluster.ClusterChangedEvent;
Expand Down Expand Up @@ -58,6 +59,8 @@ public class TrainedModelAssignmentClusterService implements ClusterStateListene

private static final Logger logger = LogManager.getLogger(TrainedModelAssignmentClusterService.class);

private static final Version RENAME_ALLOCATION_TO_ASSIGNMENT_VERSION = Version.V_8_3_0;

private final ClusterService clusterService;
private final NodeLoadDetector nodeLoadDetector;
private volatile int maxMemoryPercentage;
Expand Down Expand Up @@ -246,18 +249,23 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState)

private static ClusterState update(ClusterState currentState, TrainedModelAssignmentMetadata.Builder modelAssignments) {
if (modelAssignments.isChanged()) {
return ClusterState.builder(currentState)
.metadata(
Metadata.builder(currentState.metadata())
.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build())
.removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME)
)
.build();
return forceUpdate(currentState, modelAssignments);
} else {
return currentState;
}
}

private static ClusterState forceUpdate(ClusterState currentState, TrainedModelAssignmentMetadata.Builder modelAssignments) {
Metadata.Builder metadata = Metadata.builder(currentState.metadata());
if (currentState.getNodes().getMinNodeVersion().onOrAfter(RENAME_ALLOCATION_TO_ASSIGNMENT_VERSION)) {
metadata.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build())
.removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME);
} else {
metadata.putCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME, modelAssignments.build());
}
return ClusterState.builder(currentState).metadata(metadata).build();
}

ClusterState createModelAssignment(ClusterState currentState, StartTrainedModelDeploymentAction.TaskParams params) {
if (MlMetadata.getMlMetadata(currentState).isResetMode()) {
throw new ElasticsearchStatusException(
Expand Down Expand Up @@ -362,14 +370,7 @@ static ClusterState removeAllAssignments(ClusterState currentState) {
if (TrainedModelAssignmentMetadata.fromState(currentState).modelAssignments().isEmpty()) {
return currentState;
}
return ClusterState.builder(currentState)
.metadata(
Metadata.builder(currentState.metadata())
.putCustom(TrainedModelAssignmentMetadata.NAME, TrainedModelAssignmentMetadata.Builder.empty().build())
.removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME)
.build()
)
.build();
return forceUpdate(currentState, TrainedModelAssignmentMetadata.Builder.empty());
}

ClusterState addRemoveAssignmentNodes(ClusterState currentState) {
Expand Down Expand Up @@ -438,8 +439,8 @@ ClusterState addRemoveAssignmentNodes(ClusterState currentState) {

static boolean shouldAllocateModels(final ClusterChangedEvent event) {
// If there are no assignments created at all, there is nothing to update
final TrainedModelAssignmentMetadata newMetadata = event.state().getMetadata().custom(TrainedModelAssignmentMetadata.NAME);
if (newMetadata == null) {
final TrainedModelAssignmentMetadata newMetadata = TrainedModelAssignmentMetadata.fromState(event.state());
if (newMetadata == null || newMetadata.modelAssignments().isEmpty()) {
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ public void testRemoveAssignment() {
);

ClusterState clusterStateWithAssignment = ClusterState.builder(new ClusterName("testRemoveAssignment"))
.nodes(DiscoveryNodes.builder().add(buildNode("test-node", true, ByteSizeValue.ofGb(4).getBytes())).build())
.metadata(
Metadata.builder()
.putCustom(
Expand Down Expand Up @@ -227,6 +228,7 @@ public void testRemoveAllAssignments() {
);

ClusterState clusterStateWithAssignments = ClusterState.builder(new ClusterName("testRemoveAllAssignments"))
.nodes(DiscoveryNodes.builder().add(buildNode("test-node", true, ByteSizeValue.ofGb(4).getBytes())).build())
.metadata(
Metadata.builder()
.putCustom(TrainedModelAssignmentMetadata.NAME, TrainedModelAssignmentMetadataTests.randomInstance())
Expand Down Expand Up @@ -304,7 +306,8 @@ public void testAddRemoveAssignmentNodes() {
Metadata.builder()
.putCustom(NodesShutdownMetadata.TYPE, shutdownMetadata("ml-node-shutting-down"))
.putCustom(
TrainedModelAssignmentMetadata.NAME,
// We have to use deprecated name here as we have a node versioned before the rename
TrainedModelAssignmentMetadata.DEPRECATED_NAME,
TrainedModelAssignmentMetadata.Builder.empty()
.addNewAssignment(
"model-1",
Expand Down Expand Up @@ -373,7 +376,8 @@ public void testAddRemoveAllocationNodesPrioritizesAllocationsWithFewerNodes() {
Metadata.builder()
.putCustom(NodesShutdownMetadata.TYPE, shutdownMetadata("ml-node-shutting-down"))
.putCustom(
TrainedModelAssignmentMetadata.NAME,
// We have to use deprecated name here as we have a node versioned before the rename
TrainedModelAssignmentMetadata.DEPRECATED_NAME,
TrainedModelAssignmentMetadata.Builder.empty()
.addNewAssignment(
"model-1",
Expand Down Expand Up @@ -1035,6 +1039,7 @@ public void testSetAllocationToStopping() {
);

ClusterState clusterStateWithAllocation = ClusterState.builder(new ClusterName("testSetAllocationToStopping"))
.nodes(DiscoveryNodes.builder().add(buildNode("test-node", true, ByteSizeValue.ofGb(4).getBytes())).build())
.metadata(
Metadata.builder()
.putCustom(
Expand Down