Implement ScaleSetPriority for AzureManagedMachinePool #2604

Merged
1 change: 1 addition & 0 deletions azure/converters/managedagentpool.go
@@ -43,5 +43,6 @@ func AgentPoolToManagedClusterAgentPoolProfile(pool containerservice.AgentPool)
NodeLabels: properties.NodeLabels,
EnableUltraSSD: properties.EnableUltraSSD,
EnableNodePublicIP: properties.EnableNodePublicIP,
ScaleSetPriority: properties.ScaleSetPriority,
}
}
1 change: 1 addition & 0 deletions azure/scope/managedmachinepool.go
@@ -176,6 +176,7 @@ func buildAgentPoolSpec(managedControlPlane *infrav1exp.AzureManagedControlPlane
EnableUltraSSD: managedMachinePool.Spec.EnableUltraSSD,
Headers: maps.FilterByKeyPrefix(agentPoolAnnotations, azure.CustomHeaderPrefix),
EnableNodePublicIP: managedMachinePool.Spec.EnableNodePublicIP,
ScaleSetPriority: managedMachinePool.Spec.ScaleSetPriority,
}

if managedMachinePool.Spec.OSDiskSizeGB != nil {
27 changes: 26 additions & 1 deletion azure/services/agentpools/spec.go
@@ -26,6 +26,7 @@ import (
"github.com/pkg/errors"
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/azure"
azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
)

// AgentPoolSpec contains agent pool specification details.
@@ -92,6 +93,9 @@ type AgentPoolSpec struct {

// EnableNodePublicIP controls whether or not nodes in the agent pool each have a public IP address.
EnableNodePublicIP *bool `json:"enableNodePublicIP,omitempty"`

// ScaleSetPriority specifies the ScaleSetPriority for the node pool. Allowed values are 'Spot' and 'Regular'
ScaleSetPriority *string `json:"scaleSetPriority,omitempty"`
}

// ResourceName returns the name of the agent pool.
@@ -116,6 +120,7 @@ func (s *AgentPoolSpec) CustomHeaders() map[string]string {

// Parameters returns the parameters for the agent pool.
func (s *AgentPoolSpec) Parameters(existing interface{}) (params interface{}, err error) {
nodeLabels := s.NodeLabels
if existing != nil {
existingPool, ok := existing.(containerservice.AgentPool)
if !ok {
@@ -169,6 +174,12 @@ func (s *AgentPoolSpec) Parameters(existing interface{}) (params interface{}, err error) {
// agent pool is up to date, nothing to do
return nil, nil
}
		// We do a just-in-time merge of existing kubernetes.azure.com-prefixed labels
		// so that we don't unintentionally delete them.
		// See https://github.com/Azure/AKS/issues/3152
if normalizedProfile.NodeLabels != nil {
nodeLabels = mergeSystemNodeLabels(normalizedProfile.NodeLabels, existingPool.NodeLabels)
}
}

var availabilityZones *[]string
@@ -202,16 +213,30 @@ func (s *AgentPoolSpec) Parameters(existing interface{}) (params interface{}, err error) {
MaxPods: s.MaxPods,
MinCount: s.MinCount,
Mode: containerservice.AgentPoolMode(s.Mode),
NodeLabels: s.NodeLabels,
NodeLabels: nodeLabels,
NodeTaints: nodeTaints,
OrchestratorVersion: s.Version,
OsDiskSizeGB: &s.OSDiskSizeGB,
OsDiskType: containerservice.OSDiskType(to.String(s.OsDiskType)),
OsType: containerservice.OSType(to.String(s.OSType)),
ScaleSetPriority: containerservice.ScaleSetPriority(to.String(s.ScaleSetPriority)),
Type: containerservice.AgentPoolTypeVirtualMachineScaleSets,
VMSize: sku,
VnetSubnetID: vnetSubnetID,
EnableNodePublicIP: s.EnableNodePublicIP,
},
}, nil
}
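
A minimal sketch of the calling contract, under assumed names (the real reconciler wiring lives elsewhere in capz): Parameters returns nil, nil when the existing pool already matches the spec, so the caller can skip the PUT.

    // existing is the containerservice.AgentPool fetched from Azure,
    // or nil when the pool does not exist yet.
    params, err := spec.Parameters(existing)
    if err != nil {
        return errors.Wrap(err, "failed to compute agent pool parameters")
    }
    if params == nil {
        return nil // pool is already up to date; nothing to send
    }
    agentPool, ok := params.(containerservice.AgentPool)
    if !ok {
        return errors.New("expected a containerservice.AgentPool")
    }
    // agentPool is then submitted via the AKS CreateOrUpdate API.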

// mergeSystemNodeLabels appends any kubernetes.azure.com-prefixed labels from the AKS label set
// into the local capz label set.
func mergeSystemNodeLabels(capz, aks map[string]*string) map[string]*string {
Contributor: does this intentionally drop any labels that are not system labels (i.e. no kubernetes.azure.com prefix) but aren't part of the CAPZ spec? For example, if they were added by some external process.

Contributor Author: Yes. I think the idea here is that capz is the "exclusive" maintainer of labels, and the update interface has been defined such that every time you push a labels change you need to include the entire set of labels. Ref:

https://learn.microsoft.com/en-us/azure/aks/use-labels#updating-labels-on-existing-node-pools

The thing about the kubernetes.azure.com-prefixed labels is (IMO) an AKS bug, and hopefully we can drop this workaround at some point in the future.

ret := capz
// Look for labels returned from the AKS node pool API that begin with kubernetes.azure.com
for aksNodeLabelKey := range aks {
if azureutil.IsAzureSystemNodeLabelKey(aksNodeLabelKey) {
ret[aksNodeLabelKey] = aks[aksNodeLabelKey]
}
}
return ret
}
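
To make the merge semantics concrete, a small illustrative sketch (to.StringPtr is the autorest helper already used in the tests below; note that the returned map aliases the capz input, so capz is modified in place):

    capz := map[string]*string{"foo": to.StringPtr("bar")}
    aks := map[string]*string{
        "hello": to.StringPtr("world"), // dropped: capz owns non-system labels
        "kubernetes.azure.com/scalesetpriority": to.StringPtr("spot"), // retained: AKS system label
    }
    merged := mergeSystemNodeLabels(capz, aks)
    // merged == {"foo": "bar", "kubernetes.azure.com/scalesetpriority": "spot"}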
79 changes: 79 additions & 0 deletions azure/services/agentpools/spec_test.go
@@ -373,3 +373,82 @@ func TestParameters(t *testing.T) {
})
}
}

func TestMergeSystemNodeLabels(t *testing.T) {
testcases := []struct {
name string
capzLabels map[string]*string
aksLabels map[string]*string
expected map[string]*string
}{
{
name: "update an existing label",
capzLabels: map[string]*string{
"foo": to.StringPtr("bar"),
},
aksLabels: map[string]*string{
"foo": to.StringPtr("baz"),
},
expected: map[string]*string{
"foo": to.StringPtr("bar"),
},
},
{
name: "delete labels",
capzLabels: map[string]*string{},
aksLabels: map[string]*string{
"foo": to.StringPtr("bar"),
"hello": to.StringPtr("world"),
},
expected: map[string]*string{},
},
{
name: "delete one label",
capzLabels: map[string]*string{
"foo": to.StringPtr("bar"),
},
aksLabels: map[string]*string{
"foo": to.StringPtr("bar"),
"hello": to.StringPtr("world"),
},
expected: map[string]*string{
"foo": to.StringPtr("bar"),
},
},
{
name: "retain system label during update",
capzLabels: map[string]*string{
"foo": to.StringPtr("bar"),
},
aksLabels: map[string]*string{
"kubernetes.azure.com/scalesetpriority": to.StringPtr("spot"),
},
expected: map[string]*string{
"foo": to.StringPtr("bar"),
"kubernetes.azure.com/scalesetpriority": to.StringPtr("spot"),
},
},
{
name: "retain system label during delete",
capzLabels: map[string]*string{},
aksLabels: map[string]*string{
"kubernetes.azure.com/scalesetpriority": to.StringPtr("spot"),
},
expected: map[string]*string{
"kubernetes.azure.com/scalesetpriority": to.StringPtr("spot"),
},
},
}

for _, tc := range testcases {
t.Logf("Testing " + tc.name)
tc := tc
t.Run(tc.name, func(t *testing.T) {
g := NewWithT(t)
t.Parallel()

ret := mergeSystemNodeLabels(tc.capzLabels, tc.aksLabels)
g.Expect(ret).To(Equal(tc.expected))
})
}
}
(AzureManagedMachinePool CRD, generated manifest)
@@ -262,6 +262,13 @@ spec:
items:
type: string
type: array
scaleSetPriority:
description: 'ScaleSetPriority specifies the ScaleSetPriority value.
Default to Regular. Possible values include: ''Regular'', ''Spot'''
enum:
- Regular
- Spot
type: string
scaling:
description: Scaling specifies the autoscaling parameters for the
node pool.
1 change: 1 addition & 0 deletions exp/api/v1alpha3/azuremanagedmachinepool_conversion.go
@@ -46,6 +46,7 @@ func (src *AzureManagedMachinePool) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.NodeLabels = restored.Spec.NodeLabels
dst.Spec.EnableUltraSSD = restored.Spec.EnableUltraSSD
dst.Spec.EnableNodePublicIP = restored.Spec.EnableNodePublicIP
dst.Spec.ScaleSetPriority = restored.Spec.ScaleSetPriority

dst.Status.LongRunningOperationStates = restored.Status.LongRunningOperationStates
dst.Status.Conditions = restored.Status.Conditions
1 change: 1 addition & 0 deletions exp/api/v1alpha3/zz_generated.conversion.go

(generated file; diff not rendered)

1 change: 1 addition & 0 deletions exp/api/v1alpha4/azuremanagedmachinepool_conversion.go
@@ -46,6 +46,7 @@ func (src *AzureManagedMachinePool) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.NodeLabels = restored.Spec.NodeLabels
dst.Spec.EnableUltraSSD = restored.Spec.EnableUltraSSD
dst.Spec.EnableNodePublicIP = restored.Spec.EnableNodePublicIP
dst.Spec.ScaleSetPriority = restored.Spec.ScaleSetPriority

dst.Status.LongRunningOperationStates = restored.Status.LongRunningOperationStates
dst.Status.Conditions = restored.Status.Conditions
1 change: 1 addition & 0 deletions exp/api/v1alpha4/zz_generated.conversion.go

(generated file; diff not rendered)

5 changes: 5 additions & 0 deletions exp/api/v1beta1/azuremanagedmachinepool_types.go
@@ -102,6 +102,11 @@ type AzureManagedMachinePoolSpec struct {
// EnableNodePublicIP controls whether or not nodes in the pool each have a public IP address.
// +optional
EnableNodePublicIP *bool `json:"enableNodePublicIP,omitempty"`

// ScaleSetPriority specifies the ScaleSetPriority value. Default to Regular. Possible values include: 'Regular', 'Spot'
// +kubebuilder:validation:Enum=Regular;Spot
Member: @jackfrancis if you send the Regular value, AKS doesn't set it to Regular but returns it as empty.

Later, any other change to the node pool will fail, as the Azure API returns an error that it cannot change from "" to Regular.

Contributor Author: I wasn't able to reproduce this with label changes to the node pool. Which node pool changes were you seeing this on when ScaleSetPriority was set to "Regular"?

Contributor Author: Again, I did some more manual tests and this looks fine. Creating a cluster with an explicit scaleSetPriority: Regular AzureManagedMachinePool configuration does not have any apparent negative side effects.

I think this PR is ready to go.

Member: still doesn't work for me.

Steps to reproduce:

  1. Create an agent pool with explicit ScaleSetPriority set to Regular.
  2. Verify in az aks nodepool show --cluster-name cluster-name -n nodepool-name that it has "scaleSetPriority": null.
  3. Try to change a label in the agent pool and see the reconcile error Message=\"Changing property 'properties.ScaleSetPriority' is not allowed.\"

Contributor Author: Are you testing using a build from this branch? In this current PR implementation, I definitely can't reproduce. I do observe what you're seeing from the AKS API:

$ az aks nodepool show --cluster-name capz-e2e-l9d26h-aks -n pool1 -g capz-e2e-l9d26h-aks | grep 'scaleSetPriority'
  "scaleSetPriority": null,

However, I'm able to continually update the node pool using capz (using node labels below as an example):

$ k get azuremanagedmachinepool/pool1 -n capz-e2e-l9d26h -o yaml | grep -A 5 '  nodeLabels'
  nodeLabels:
    foo: bar
  osDiskSizeGB: 40
  osDiskType: Ephemeral
  osType: Linux
  providerIDList:

You can see I've already applied a foo: bar node label. I'll now add a 2nd label:

$ k edit azuremanagedmachinepool/pool1 -n capz-e2e-l9d26h
azuremanagedmachinepool.infrastructure.cluster.x-k8s.io/pool1 edited
$ k get azuremanagedmachinepool/pool1 -n capz-e2e-l9d26h -o yaml | grep -A 5 '  nodeLabels'
  nodeLabels:
    foo: bar
    hello: world
  osDiskSizeGB: 40
  osDiskType: Ephemeral
  osType: Linux

Now if I get the node pool data from the AKS API I see the updated label:

$ az aks nodepool show --cluster-name capz-e2e-l9d26h-aks -n pool1 -g capz-e2e-l9d26h-aks | grep -A 5 'nodeLabels'
  "nodeLabels": {
    "foo": "bar",
    "hello": "world"
  },
  "nodePublicIpPrefixId": null,
  "nodeTaints": [

As expected, ScaleSetPriority is unchanged from either source:

$ az aks nodepool show --cluster-name capz-e2e-l9d26h-aks -n pool1 -g capz-e2e-l9d26h-aks | grep 'scaleSetPriority'
  "scaleSetPriority": null,
$ k get azuremanagedmachinepool/pool1 -n capz-e2e-l9d26h -o yaml | grep '  scaleSetPriority'
  scaleSetPriority: Regular

I think this PR has overcome this particular pathology. We should ship additional E2E tests with this PR to prove it, but I'm confident we're good here.

Member: It's definitely working now; did the Azure API change something? In any case, I agree to merge this.

// +optional
ScaleSetPriority *string `json:"scaleSetPriority,omitempty"`
}

// ManagedMachinePoolScaling specifies scaling options.
Expand Down
30 changes: 30 additions & 0 deletions exp/api/v1beta1/azuremanagedmachinepool_webhook.go
@@ -28,6 +28,7 @@ import (
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/validation/field"
"sigs.k8s.io/cluster-api-provider-azure/azure"
azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
"sigs.k8s.io/cluster-api-provider-azure/util/maps"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -59,6 +60,7 @@ func (m *AzureManagedMachinePool) ValidateCreate(client client.Client) error {
m.validateMaxPods,
m.validateOSType,
m.validateName,
m.validateNodeLabels,
}

var errs []error
@@ -76,6 +78,14 @@ func (m *AzureManagedMachinePool) ValidateUpdate(oldRaw runtime.Object, client client.Client) error {
old := oldRaw.(*AzureManagedMachinePool)
var allErrs field.ErrorList

if err := m.validateNodeLabels(); err != nil {
allErrs = append(allErrs,
field.Invalid(
field.NewPath("Spec", "NodeLabels"),
m.Spec.NodeLabels,
err.Error()))
}

if old.Spec.OSType != nil {
// Prevent OSType modification if it was already set to some value
if m.Spec.OSType == nil {
@@ -189,6 +199,13 @@ func (m *AzureManagedMachinePool) ValidateUpdate(oldRaw runtime.Object, client client.Client) error {
}
}

if !reflect.DeepEqual(m.Spec.ScaleSetPriority, old.Spec.ScaleSetPriority) {
allErrs = append(allErrs,
field.Invalid(field.NewPath("Spec", "ScaleSetPriority"),
m.Spec.ScaleSetPriority, "field is immutable"),
)
}

if err := validateBoolPtrImmutable(
field.NewPath("Spec", "EnableUltraSSD"),
old.Spec.EnableUltraSSD,
@@ -298,6 +315,19 @@ func (m *AzureManagedMachinePool) validateName() error {
return nil
}

func (m *AzureManagedMachinePool) validateNodeLabels() error {
Contributor Author: I don't think it's possible to do this validation in the kubebuilder API type spec (e.g., using the Pattern= interface to define a required regular expression match) because the API type is a map[string]string.


for key := range m.Spec.NodeLabels {
if azureutil.IsAzureSystemNodeLabelKey(key) {
return field.Invalid(
field.NewPath("Spec", "NodeLabels"),
key,
fmt.Sprintf("Node pool label key must not start with %s", azureutil.AzureSystemNodeLabelPrefix))
}
}

return nil
}

func ensureStringSlicesAreEqual(a []string, b []string) bool {
if len(a) != len(b) {
return false
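The azureutil.IsAzureSystemNodeLabelKey helper used by both mergeSystemNodeLabels and the webhook above is outside this diff; a minimal sketch of its plausible shape, assuming AzureSystemNodeLabelPrefix is the kubernetes.azure.com prefix seen in the tests:

    package azure // util/azure (assumed location)

    import "strings"

    // AzureSystemNodeLabelPrefix is the prefix AKS reserves for system
    // node labels (assumed value, consistent with the tests in this PR).
    const AzureSystemNodeLabelPrefix = "kubernetes.azure.com"

    // IsAzureSystemNodeLabelKey returns true when the label key is an
    // AKS-managed system label.
    func IsAzureSystemNodeLabelKey(labelKey string) bool {
        return strings.HasPrefix(labelKey, AzureSystemNodeLabelPrefix)
    }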
48 changes: 48 additions & 0 deletions exp/api/v1beta1/azuremanagedmachinepool_webhook_test.go
@@ -462,10 +462,31 @@ func TestAzureManagedMachinePoolUpdatingWebhook(t *testing.T) {
},
wantErr: false,
},
{
name: "Can't add a node label that begins with kubernetes.azure.com",
new: &AzureManagedMachinePool{
Spec: AzureManagedMachinePoolSpec{
NodeLabels: map[string]string{
"foo": "bar",
"kubernetes.azure.com/scalesetpriority": "spot",
},
},
},
old: &AzureManagedMachinePool{
Spec: AzureManagedMachinePoolSpec{
NodeLabels: map[string]string{
"foo": "bar",
},
},
},
wantErr: true,
},
}
var client client.Client
for _, tc := range tests {
tc := tc
Contributor: FWIW I don't think doing this is necessary here, because we're never taking the address of tc in the subtest (like with &tc). Certainly doesn't hurt to keep it here, though, in case this test needs to do that in the future.

Contributor Author: Two points:

  1. It's correct that we do this as a best practice, to avoid the non-deterministic UT outcomes that result from not doing this when we need to.
  2. Doing it all the time as a rote practice tends to make the developer forget the exact edge-case root cause that makes this pattern a necessary thing, and it becomes a kind of incantation.

🤷
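
For background, a minimal sketch (not part of this diff) of the pre-Go 1.22 loop-variable capture pitfall that the tc := tc shadow copy guards against when subtests run in parallel:

    for _, tc := range tests {
        tc := tc // shadow copy: each parallel closure gets its own tc
        t.Run(tc.name, func(t *testing.T) {
            t.Parallel() // subtest body runs after the range loop has advanced
            // Without the shadow copy, every parallel subtest could observe
            // the final iteration's tc (pre-Go 1.22 loop semantics).
            _ = tc
        })
    }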

t.Run(tc.name, func(t *testing.T) {
t.Parallel()
err := tc.new.ValidateUpdate(tc.old, client)
if tc.wantErr {
g.Expect(err).To(HaveOccurred())
@@ -650,6 +671,33 @@ func TestAzureManagedMachinePool_ValidateCreate(t *testing.T) {
wantErr: true,
errorLen: 1,
},
{
name: "valid label",
ammp: &AzureManagedMachinePool{
Spec: AzureManagedMachinePoolSpec{
Mode: "User",
OSType: to.StringPtr(azure.LinuxOS),
NodeLabels: map[string]string{
"foo": "bar",
},
},
},
wantErr: false,
},
{
name: "kubernetes.azure.com label",
ammp: &AzureManagedMachinePool{
Spec: AzureManagedMachinePoolSpec{
Mode: "User",
OSType: to.StringPtr(azure.LinuxOS),
NodeLabels: map[string]string{
"kubernetes.azure.com/scalesetpriority": "spot",
},
},
},
wantErr: true,
errorLen: 1,
},
}
var client client.Client
for _, tc := range tests {
5 changes: 5 additions & 0 deletions exp/api/v1beta1/zz_generated.deepcopy.go

(generated file; diff not rendered)

1 change: 1 addition & 0 deletions templates/cluster-template-aks.yaml

(generated file; diff not rendered)

1 change: 1 addition & 0 deletions templates/flavors/aks/cluster-template.yaml
@@ -106,3 +106,4 @@ spec:
osDiskSizeGB: 40
sku: "${AZURE_NODE_MACHINE_TYPE}"
enableNodePublicIP: false
scaleSetPriority: Regular
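
For comparison, a hedged sketch of what a Spot-backed pool could look like (names are illustrative, not from this PR). AKS itself injects the kubernetes.azure.com/scalesetpriority=spot system label on such pools, which is exactly what mergeSystemNodeLabels above preserves:

    apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
    kind: AzureManagedMachinePool
    metadata:
      name: "${CLUSTER_NAME}-pool-spot" # illustrative name
    spec:
      mode: User # Spot pools cannot be System pools in AKS
      osDiskSizeGB: 40
      sku: "${AZURE_NODE_MACHINE_TYPE}"
      scaleSetPriority: Spot # immutable after creation, per the webhook above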
1 change: 1 addition & 0 deletions templates/test/ci/cluster-template-prow-aks.yaml

(generated file; diff not rendered)
