diff --git a/api/v1/runtimecomponent_types.go b/api/v1/runtimecomponent_types.go index 75ac98a2..cd05e1f9 100644 --- a/api/v1/runtimecomponent_types.go +++ b/api/v1/runtimecomponent_types.go @@ -146,6 +146,20 @@ type RuntimeComponentSpec struct { // Security context for the application container. // +operator-sdk:csv:customresourcedefinitions:order=25,type=spec,displayName="Security Context" SecurityContext *corev1.SecurityContext `json:"securityContext,omitempty"` + + // +operator-sdk:csv:customresourcedefinitions:order=26,type=spec,displayName="Topology Spread Constraints" + TopologySpreadConstraints *RuntimeComponentTopologySpreadConstraints `json:"topologySpreadConstraints,omitempty"` +} + +// Defines the topology spread constraints +type RuntimeComponentTopologySpreadConstraints struct { + // The list of TopologySpreadConstraints for the application pod. + // +operator-sdk:csv:customresourcedefinitions:order=1,type=spec,displayName="Constraints" + Constraints *[]corev1.TopologySpreadConstraint `json:"constraints,omitempty"` + + // Whether the operator should disable its default set of TopologySpreadConstraints. Defaults to false. + // +operator-sdk:csv:customresourcedefinitions:order=1,type=spec,displayName="Disable Operator Defaults",xDescriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + DisableOperatorDefaults *bool `json:"disableOperatorDefaults,omitempty"` } // Defines the service account @@ -917,6 +931,25 @@ func (cr *RuntimeComponent) GetSecurityContext() *corev1.SecurityContext { return cr.Spec.SecurityContext } +// GetTopologySpreadConstraints returns the pod topology spread constraints configuration +func (cr *RuntimeComponent) GetTopologySpreadConstraints() common.BaseComponentTopologySpreadConstraints { + if cr.Spec.TopologySpreadConstraints == nil { + return nil + } + return cr.Spec.TopologySpreadConstraints +} + +func (cr *RuntimeComponentTopologySpreadConstraints) GetConstraints() *[]corev1.TopologySpreadConstraint { + if cr.Constraints == nil { + return nil + } + return cr.Constraints +} + +func (cr *RuntimeComponentTopologySpreadConstraints) GetDisableOperatorDefaults() *bool { + return cr.DisableOperatorDefaults +} + // Initialize the RuntimeComponent instance func (cr *RuntimeComponent) Initialize() { if cr.Spec.PullPolicy == nil { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 7730669f..8c2f7381 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -587,6 +587,11 @@ func (in *RuntimeComponentSpec) DeepCopyInto(out *RuntimeComponentSpec) { *out = new(corev1.SecurityContext) (*in).DeepCopyInto(*out) } + if in.TopologySpreadConstraints != nil { + in, out := &in.TopologySpreadConstraints, &out.TopologySpreadConstraints + *out = new(RuntimeComponentTopologySpreadConstraints) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeComponentSpec. @@ -691,6 +696,37 @@ func (in *RuntimeComponentStorage) DeepCopy() *RuntimeComponentStorage { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RuntimeComponentTopologySpreadConstraints) DeepCopyInto(out *RuntimeComponentTopologySpreadConstraints) { + *out = *in + if in.Constraints != nil { + in, out := &in.Constraints, &out.Constraints + *out = new([]corev1.TopologySpreadConstraint) + if **in != nil { + in, out := *in, *out + *out = make([]corev1.TopologySpreadConstraint, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + } + if in.DisableOperatorDefaults != nil { + in, out := &in.DisableOperatorDefaults, &out.DisableOperatorDefaults + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeComponentTopologySpreadConstraints. +func (in *RuntimeComponentTopologySpreadConstraints) DeepCopy() *RuntimeComponentTopologySpreadConstraints { + if in == nil { + return nil + } + out := new(RuntimeComponentTopologySpreadConstraints) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RuntimeOperation) DeepCopyInto(out *RuntimeOperation) { *out = *in diff --git a/bundle/manifests/rc.app.stacks_runtimecomponents.yaml b/bundle/manifests/rc.app.stacks_runtimecomponents.yaml index 4227ac33..cfb0ed83 100644 --- a/bundle/manifests/rc.app.stacks_runtimecomponents.yaml +++ b/bundle/manifests/rc.app.stacks_runtimecomponents.yaml @@ -5428,6 +5428,189 @@ spec: type: string type: object type: object + topologySpreadConstraints: + description: Defines the topology spread constraints + properties: + constraints: + description: The list of TopologySpreadConstraints for the application + pod. + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine + the number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: MatchLabelKeys is a set of pod label keys to + select the pods over which spreading will be calculated. + The keys are used to lookup values from the incoming pod + labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading + will be calculated for the incoming pod. Keys that don't + exist in the incoming pod labels will be ignored. A null + or empty list means only match against labelSelector. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: 'MaxSkew describes the degree to which pods + may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, + it is the maximum permitted difference between the number + of matching pods in the target topology and the global + minimum. The global minimum is the minimum number of matching + pods in an eligible domain or zero if the number of eligible + domains is less than MinDomains. For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 2/2/1: In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | | P P | P P | P | - + if MaxSkew is 1, incoming pod can only be scheduled to + zone3 to become 2/2/2; scheduling it onto zone1(zone2) + would make the ActualSkew(3-1) on zone1(zone2) violate + MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled + onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, + it is used to give higher precedence to topologies that + satisfy it. It''s a required field. Default value is 1 + and 0 is not allowed.' + format: int32 + type: integer + minDomains: + description: "MinDomains indicates a minimum number of eligible + domains. When the number of eligible domains with matching + topology keys is less than minDomains, Pod Topology Spread + treats \"global minimum\" as 0, and then the calculation + of Skew is performed. And when the number of eligible + domains with matching topology keys equals or greater + than minDomains, this value has no effect on scheduling. + As a result, when the number of eligible domains is less + than minDomains, scheduler won't schedule more than maxSkew + Pods to those domains. If value is nil, the constraint + behaves as if MinDomains is equal to 1. Valid values are + integers greater than 0. When value is not nil, WhenUnsatisfiable + must be DoNotSchedule. \n For example, in a 3-zone cluster, + MaxSkew is set to 2, MinDomains is set to 5 and pods with + the same labelSelector spread as 2/2/2: | zone1 | zone2 + | zone3 | | P P | P P | P P | The number of domains + is less than 5(MinDomains), so \"global minimum\" is treated + as 0. In this situation, new pod with the same labelSelector + cannot be scheduled, because computed skew will be 3(3 + - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. \n This is a beta field and requires + the MinDomainsInPodTopologySpread feature gate to be enabled + (enabled by default)." + format: int32 + type: integer + nodeAffinityPolicy: + description: "NodeAffinityPolicy indicates how we will treat + Pod's nodeAffinity/nodeSelector when calculating pod topology + spread skew. Options are: - Honor: only nodes matching + nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes + are included in the calculations. \n If this value is + nil, the behavior is equivalent to the Honor policy. This + is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + nodeTaintsPolicy: + description: "NodeTaintsPolicy indicates how we will treat + node taints when calculating pod topology spread skew. + Options are: - Honor: nodes without taints, along with + tainted nodes for which the incoming pod has a toleration, + are included. - Ignore: node taints are ignored. All nodes + are included. \n If this value is nil, the behavior is + equivalent to the Ignore policy. This is a alpha-level + feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + topologyKey: + description: TopologyKey is the key of node labels. Nodes + that have a label with this key and identical values are + considered to be in the same topology. We consider each + as a "bucket", and try to put balanced number + of pods into each bucket. We define a domain as a particular + instance of a topology. Also, we define an eligible domain + as a domain whose nodes meet the requirements of nodeAffinityPolicy + and nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname", + each Node is a domain of that topology. And, if TopologyKey + is "topology.kubernetes.io/zone", each zone is a domain + of that topology. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with + a pod if it doesn''t satisfy the spread constraint. - + DoNotSchedule (default) tells the scheduler not to schedule + it. - ScheduleAnyway tells the scheduler to schedule the + pod in any location, but giving higher precedence to topologies + that would help reduce the skew. A constraint is considered + "Unsatisfiable" for an incoming pod if and only if every + possible node assignment for that pod would violate "MaxSkew" + on some topology. For example, in a 3-zone cluster, MaxSkew + is set to 1, and pods with the same labelSelector spread + as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming + pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). + In other words, the cluster can still be imbalanced, but + scheduler won''t make it *more* imbalanced. It''s a required + field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + disableOperatorDefaults: + description: Whether the operator should disable its default set + of TopologySpreadConstraints. Defaults to false. + type: boolean + type: object volumeMounts: description: Represents where to mount the volumes into the application container. diff --git a/bundle/manifests/runtime-component.clusterserviceversion.yaml b/bundle/manifests/runtime-component.clusterserviceversion.yaml index 237653d9..a272e29d 100644 --- a/bundle/manifests/runtime-component.clusterserviceversion.yaml +++ b/bundle/manifests/runtime-component.clusterserviceversion.yaml @@ -71,7 +71,7 @@ metadata: categories: Application Runtime certified: "true" containerImage: icr.io/appcafe/runtime-component-operator:daily - createdAt: "2023-10-27T14:38:16Z" + createdAt: "2023-11-01T20:37:23Z" description: Deploys any runtime component with dynamic and auto-tuning configuration olm.skipRange: '>=0.8.0 <1.3.0' operators.openshift.io/infrastructure-features: '["disconnected"]' @@ -142,6 +142,15 @@ spec: path: serviceAccount.mountToken x-descriptors: - urn:alm:descriptor:com.tectonic.ui:booleanSwitch + - description: The list of TopologySpreadConstraints for the application pod. + displayName: Constraints + path: topologySpreadConstraints.constraints + - description: Whether the operator should disable its default set of TopologySpreadConstraints. + Defaults to false. + displayName: Disable Operator Defaults + path: topologySpreadConstraints.disableOperatorDefaults + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:booleanSwitch - description: Name of the application. Defaults to the name of this custom resource. displayName: Application Name @@ -347,6 +356,8 @@ spec: path: statefulSet.storage.className x-descriptors: - urn:alm:descriptor:com.tectonic.ui:text + - displayName: Topology Spread Constraints + path: topologySpreadConstraints - description: The directory inside the container where this persisted storage will be bound to. displayName: Storage Mount Path diff --git a/common/types.go b/common/types.go index d971d87e..8717eceb 100644 --- a/common/types.go +++ b/common/types.go @@ -190,6 +190,11 @@ type BaseComponentServiceAccount interface { GetName() *string } +type BaseComponentTopologySpreadConstraints interface { + GetConstraints() *[]corev1.TopologySpreadConstraint + GetDisableOperatorDefaults() *bool +} + // BaseComponent represents basic kubernetes application type BaseComponent interface { GetApplicationImage() string @@ -222,6 +227,7 @@ type BaseComponent interface { GetGroupName() string GetRoute() BaseComponentRoute GetAffinity() BaseComponentAffinity + GetTopologySpreadConstraints() BaseComponentTopologySpreadConstraints GetSecurityContext() *corev1.SecurityContext GetManageTLS() *bool } diff --git a/config/crd/bases/rc.app.stacks_runtimecomponents.yaml b/config/crd/bases/rc.app.stacks_runtimecomponents.yaml index 2f0e90d0..b28faca1 100644 --- a/config/crd/bases/rc.app.stacks_runtimecomponents.yaml +++ b/config/crd/bases/rc.app.stacks_runtimecomponents.yaml @@ -5425,6 +5425,189 @@ spec: type: string type: object type: object + topologySpreadConstraints: + description: Defines the topology spread constraints + properties: + constraints: + description: The list of TopologySpreadConstraints for the application + pod. + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine + the number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: MatchLabelKeys is a set of pod label keys to + select the pods over which spreading will be calculated. + The keys are used to lookup values from the incoming pod + labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading + will be calculated for the incoming pod. Keys that don't + exist in the incoming pod labels will be ignored. A null + or empty list means only match against labelSelector. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: 'MaxSkew describes the degree to which pods + may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, + it is the maximum permitted difference between the number + of matching pods in the target topology and the global + minimum. The global minimum is the minimum number of matching + pods in an eligible domain or zero if the number of eligible + domains is less than MinDomains. For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 2/2/1: In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | | P P | P P | P | - + if MaxSkew is 1, incoming pod can only be scheduled to + zone3 to become 2/2/2; scheduling it onto zone1(zone2) + would make the ActualSkew(3-1) on zone1(zone2) violate + MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled + onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, + it is used to give higher precedence to topologies that + satisfy it. It''s a required field. Default value is 1 + and 0 is not allowed.' + format: int32 + type: integer + minDomains: + description: "MinDomains indicates a minimum number of eligible + domains. When the number of eligible domains with matching + topology keys is less than minDomains, Pod Topology Spread + treats \"global minimum\" as 0, and then the calculation + of Skew is performed. And when the number of eligible + domains with matching topology keys equals or greater + than minDomains, this value has no effect on scheduling. + As a result, when the number of eligible domains is less + than minDomains, scheduler won't schedule more than maxSkew + Pods to those domains. If value is nil, the constraint + behaves as if MinDomains is equal to 1. Valid values are + integers greater than 0. When value is not nil, WhenUnsatisfiable + must be DoNotSchedule. \n For example, in a 3-zone cluster, + MaxSkew is set to 2, MinDomains is set to 5 and pods with + the same labelSelector spread as 2/2/2: | zone1 | zone2 + | zone3 | | P P | P P | P P | The number of domains + is less than 5(MinDomains), so \"global minimum\" is treated + as 0. In this situation, new pod with the same labelSelector + cannot be scheduled, because computed skew will be 3(3 + - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. \n This is a beta field and requires + the MinDomainsInPodTopologySpread feature gate to be enabled + (enabled by default)." + format: int32 + type: integer + nodeAffinityPolicy: + description: "NodeAffinityPolicy indicates how we will treat + Pod's nodeAffinity/nodeSelector when calculating pod topology + spread skew. Options are: - Honor: only nodes matching + nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes + are included in the calculations. \n If this value is + nil, the behavior is equivalent to the Honor policy. This + is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + nodeTaintsPolicy: + description: "NodeTaintsPolicy indicates how we will treat + node taints when calculating pod topology spread skew. + Options are: - Honor: nodes without taints, along with + tainted nodes for which the incoming pod has a toleration, + are included. - Ignore: node taints are ignored. All nodes + are included. \n If this value is nil, the behavior is + equivalent to the Ignore policy. This is a alpha-level + feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + topologyKey: + description: TopologyKey is the key of node labels. Nodes + that have a label with this key and identical values are + considered to be in the same topology. We consider each + as a "bucket", and try to put balanced number + of pods into each bucket. We define a domain as a particular + instance of a topology. Also, we define an eligible domain + as a domain whose nodes meet the requirements of nodeAffinityPolicy + and nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname", + each Node is a domain of that topology. And, if TopologyKey + is "topology.kubernetes.io/zone", each zone is a domain + of that topology. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with + a pod if it doesn''t satisfy the spread constraint. - + DoNotSchedule (default) tells the scheduler not to schedule + it. - ScheduleAnyway tells the scheduler to schedule the + pod in any location, but giving higher precedence to topologies + that would help reduce the skew. A constraint is considered + "Unsatisfiable" for an incoming pod if and only if every + possible node assignment for that pod would violate "MaxSkew" + on some topology. For example, in a 3-zone cluster, MaxSkew + is set to 1, and pods with the same labelSelector spread + as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming + pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). + In other words, the cluster can still be imbalanced, but + scheduler won''t make it *more* imbalanced. It''s a required + field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + disableOperatorDefaults: + description: Whether the operator should disable its default set + of TopologySpreadConstraints. Defaults to false. + type: boolean + type: object volumeMounts: description: Represents where to mount the volumes into the application container. diff --git a/config/manifests/bases/runtime-component.clusterserviceversion.yaml b/config/manifests/bases/runtime-component.clusterserviceversion.yaml index f18415fd..da71af42 100644 --- a/config/manifests/bases/runtime-component.clusterserviceversion.yaml +++ b/config/manifests/bases/runtime-component.clusterserviceversion.yaml @@ -76,6 +76,15 @@ spec: path: serviceAccount.mountToken x-descriptors: - urn:alm:descriptor:com.tectonic.ui:booleanSwitch + - description: The list of TopologySpreadConstraints for the application pod. + displayName: Constraints + path: topologySpreadConstraints.constraints + - description: Whether the operator should disable its default set of TopologySpreadConstraints. + Defaults to false. + displayName: Disable Operator Defaults + path: topologySpreadConstraints.disableOperatorDefaults + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:booleanSwitch - description: Name of the application. Defaults to the name of this custom resource. displayName: Application Name @@ -281,6 +290,8 @@ spec: path: statefulSet.storage.className x-descriptors: - urn:alm:descriptor:com.tectonic.ui:text + - displayName: Topology Spread Constraints + path: topologySpreadConstraints - description: The directory inside the container where this persisted storage will be bound to. displayName: Storage Mount Path diff --git a/internal/deploy/kubectl/runtime-component-crd.yaml b/internal/deploy/kubectl/runtime-component-crd.yaml index f58b0534..ded240cc 100644 --- a/internal/deploy/kubectl/runtime-component-crd.yaml +++ b/internal/deploy/kubectl/runtime-component-crd.yaml @@ -5427,6 +5427,189 @@ spec: type: string type: object type: object + topologySpreadConstraints: + description: Defines the topology spread constraints + properties: + constraints: + description: The list of TopologySpreadConstraints for the application + pod. + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine + the number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: MatchLabelKeys is a set of pod label keys to + select the pods over which spreading will be calculated. + The keys are used to lookup values from the incoming pod + labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading + will be calculated for the incoming pod. Keys that don't + exist in the incoming pod labels will be ignored. A null + or empty list means only match against labelSelector. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: 'MaxSkew describes the degree to which pods + may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, + it is the maximum permitted difference between the number + of matching pods in the target topology and the global + minimum. The global minimum is the minimum number of matching + pods in an eligible domain or zero if the number of eligible + domains is less than MinDomains. For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 2/2/1: In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | | P P | P P | P | - + if MaxSkew is 1, incoming pod can only be scheduled to + zone3 to become 2/2/2; scheduling it onto zone1(zone2) + would make the ActualSkew(3-1) on zone1(zone2) violate + MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled + onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, + it is used to give higher precedence to topologies that + satisfy it. It''s a required field. Default value is 1 + and 0 is not allowed.' + format: int32 + type: integer + minDomains: + description: "MinDomains indicates a minimum number of eligible + domains. When the number of eligible domains with matching + topology keys is less than minDomains, Pod Topology Spread + treats \"global minimum\" as 0, and then the calculation + of Skew is performed. And when the number of eligible + domains with matching topology keys equals or greater + than minDomains, this value has no effect on scheduling. + As a result, when the number of eligible domains is less + than minDomains, scheduler won't schedule more than maxSkew + Pods to those domains. If value is nil, the constraint + behaves as if MinDomains is equal to 1. Valid values are + integers greater than 0. When value is not nil, WhenUnsatisfiable + must be DoNotSchedule. \n For example, in a 3-zone cluster, + MaxSkew is set to 2, MinDomains is set to 5 and pods with + the same labelSelector spread as 2/2/2: | zone1 | zone2 + | zone3 | | P P | P P | P P | The number of domains + is less than 5(MinDomains), so \"global minimum\" is treated + as 0. In this situation, new pod with the same labelSelector + cannot be scheduled, because computed skew will be 3(3 + - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. \n This is a beta field and requires + the MinDomainsInPodTopologySpread feature gate to be enabled + (enabled by default)." + format: int32 + type: integer + nodeAffinityPolicy: + description: "NodeAffinityPolicy indicates how we will treat + Pod's nodeAffinity/nodeSelector when calculating pod topology + spread skew. Options are: - Honor: only nodes matching + nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes + are included in the calculations. \n If this value is + nil, the behavior is equivalent to the Honor policy. This + is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + nodeTaintsPolicy: + description: "NodeTaintsPolicy indicates how we will treat + node taints when calculating pod topology spread skew. + Options are: - Honor: nodes without taints, along with + tainted nodes for which the incoming pod has a toleration, + are included. - Ignore: node taints are ignored. All nodes + are included. \n If this value is nil, the behavior is + equivalent to the Ignore policy. This is a alpha-level + feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + topologyKey: + description: TopologyKey is the key of node labels. Nodes + that have a label with this key and identical values are + considered to be in the same topology. We consider each + as a "bucket", and try to put balanced number + of pods into each bucket. We define a domain as a particular + instance of a topology. Also, we define an eligible domain + as a domain whose nodes meet the requirements of nodeAffinityPolicy + and nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname", + each Node is a domain of that topology. And, if TopologyKey + is "topology.kubernetes.io/zone", each zone is a domain + of that topology. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with + a pod if it doesn''t satisfy the spread constraint. - + DoNotSchedule (default) tells the scheduler not to schedule + it. - ScheduleAnyway tells the scheduler to schedule the + pod in any location, but giving higher precedence to topologies + that would help reduce the skew. A constraint is considered + "Unsatisfiable" for an incoming pod if and only if every + possible node assignment for that pod would violate "MaxSkew" + on some topology. For example, in a 3-zone cluster, MaxSkew + is set to 1, and pods with the same labelSelector spread + as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming + pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). + In other words, the cluster can still be imbalanced, but + scheduler won''t make it *more* imbalanced. It''s a required + field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + disableOperatorDefaults: + description: Whether the operator should disable its default set + of TopologySpreadConstraints. Defaults to false. + type: boolean + type: object volumeMounts: description: Represents where to mount the volumes into the application container. diff --git a/internal/deploy/kustomize/daily/base/runtime-component-crd.yaml b/internal/deploy/kustomize/daily/base/runtime-component-crd.yaml index f58b0534..ded240cc 100644 --- a/internal/deploy/kustomize/daily/base/runtime-component-crd.yaml +++ b/internal/deploy/kustomize/daily/base/runtime-component-crd.yaml @@ -5427,6 +5427,189 @@ spec: type: string type: object type: object + topologySpreadConstraints: + description: Defines the topology spread constraints + properties: + constraints: + description: The list of TopologySpreadConstraints for the application + pod. + items: + description: TopologySpreadConstraint specifies how to spread + matching pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. + Pods that match this label selector are counted to determine + the number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + matchLabelKeys: + description: MatchLabelKeys is a set of pod label keys to + select the pods over which spreading will be calculated. + The keys are used to lookup values from the incoming pod + labels, those key-value labels are ANDed with labelSelector + to select the group of existing pods over which spreading + will be calculated for the incoming pod. Keys that don't + exist in the incoming pod labels will be ignored. A null + or empty list means only match against labelSelector. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxSkew: + description: 'MaxSkew describes the degree to which pods + may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, + it is the maximum permitted difference between the number + of matching pods in the target topology and the global + minimum. The global minimum is the minimum number of matching + pods in an eligible domain or zero if the number of eligible + domains is less than MinDomains. For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 2/2/1: In this case, the global minimum is 1. + | zone1 | zone2 | zone3 | | P P | P P | P | - + if MaxSkew is 1, incoming pod can only be scheduled to + zone3 to become 2/2/2; scheduling it onto zone1(zone2) + would make the ActualSkew(3-1) on zone1(zone2) violate + MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled + onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, + it is used to give higher precedence to topologies that + satisfy it. It''s a required field. Default value is 1 + and 0 is not allowed.' + format: int32 + type: integer + minDomains: + description: "MinDomains indicates a minimum number of eligible + domains. When the number of eligible domains with matching + topology keys is less than minDomains, Pod Topology Spread + treats \"global minimum\" as 0, and then the calculation + of Skew is performed. And when the number of eligible + domains with matching topology keys equals or greater + than minDomains, this value has no effect on scheduling. + As a result, when the number of eligible domains is less + than minDomains, scheduler won't schedule more than maxSkew + Pods to those domains. If value is nil, the constraint + behaves as if MinDomains is equal to 1. Valid values are + integers greater than 0. When value is not nil, WhenUnsatisfiable + must be DoNotSchedule. \n For example, in a 3-zone cluster, + MaxSkew is set to 2, MinDomains is set to 5 and pods with + the same labelSelector spread as 2/2/2: | zone1 | zone2 + | zone3 | | P P | P P | P P | The number of domains + is less than 5(MinDomains), so \"global minimum\" is treated + as 0. In this situation, new pod with the same labelSelector + cannot be scheduled, because computed skew will be 3(3 + - 0) if new Pod is scheduled to any of the three zones, + it will violate MaxSkew. \n This is a beta field and requires + the MinDomainsInPodTopologySpread feature gate to be enabled + (enabled by default)." + format: int32 + type: integer + nodeAffinityPolicy: + description: "NodeAffinityPolicy indicates how we will treat + Pod's nodeAffinity/nodeSelector when calculating pod topology + spread skew. Options are: - Honor: only nodes matching + nodeAffinity/nodeSelector are included in the calculations. + - Ignore: nodeAffinity/nodeSelector are ignored. All nodes + are included in the calculations. \n If this value is + nil, the behavior is equivalent to the Honor policy. This + is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + nodeTaintsPolicy: + description: "NodeTaintsPolicy indicates how we will treat + node taints when calculating pod topology spread skew. + Options are: - Honor: nodes without taints, along with + tainted nodes for which the incoming pod has a toleration, + are included. - Ignore: node taints are ignored. All nodes + are included. \n If this value is nil, the behavior is + equivalent to the Ignore policy. This is a alpha-level + feature enabled by the NodeInclusionPolicyInPodTopologySpread + feature flag." + type: string + topologyKey: + description: TopologyKey is the key of node labels. Nodes + that have a label with this key and identical values are + considered to be in the same topology. We consider each + as a "bucket", and try to put balanced number + of pods into each bucket. We define a domain as a particular + instance of a topology. Also, we define an eligible domain + as a domain whose nodes meet the requirements of nodeAffinityPolicy + and nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname", + each Node is a domain of that topology. And, if TopologyKey + is "topology.kubernetes.io/zone", each zone is a domain + of that topology. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with + a pod if it doesn''t satisfy the spread constraint. - + DoNotSchedule (default) tells the scheduler not to schedule + it. - ScheduleAnyway tells the scheduler to schedule the + pod in any location, but giving higher precedence to topologies + that would help reduce the skew. A constraint is considered + "Unsatisfiable" for an incoming pod if and only if every + possible node assignment for that pod would violate "MaxSkew" + on some topology. For example, in a 3-zone cluster, MaxSkew + is set to 1, and pods with the same labelSelector spread + as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming + pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). + In other words, the cluster can still be imbalanced, but + scheduler won''t make it *more* imbalanced. It''s a required + field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + disableOperatorDefaults: + description: Whether the operator should disable its default set + of TopologySpreadConstraints. Defaults to false. + type: boolean + type: object volumeMounts: description: Represents where to mount the volumes into the application container. diff --git a/utils/utils.go b/utils/utils.go index 81bc66a2..25bfd679 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -724,6 +724,15 @@ func CustomizePodSpec(pts *corev1.PodTemplateSpec, ba common.BaseComponent) { pts.Spec.RestartPolicy = corev1.RestartPolicyAlways pts.Spec.DNSPolicy = corev1.DNSClusterFirst + pts.Spec.TopologySpreadConstraints = make([]corev1.TopologySpreadConstraint, 0) + topologySpreadConstraintsConfig := ba.GetTopologySpreadConstraints() + if topologySpreadConstraintsConfig == nil || topologySpreadConstraintsConfig.GetDisableOperatorDefaults() == nil || !*topologySpreadConstraintsConfig.GetDisableOperatorDefaults() { + CustomizeTopologySpreadConstraints(pts, map[string]string{"app.kubernetes.io/instance": obj.GetName()}) + } + if topologySpreadConstraintsConfig != nil && topologySpreadConstraintsConfig.GetConstraints() != nil { + pts.Spec.TopologySpreadConstraints = MergeTopologySpreadConstraints(pts.Spec.TopologySpreadConstraints, *topologySpreadConstraintsConfig.GetConstraints()) + } + pts.Spec.Affinity = &corev1.Affinity{} CustomizeAffinity(pts.Spec.Affinity, ba) @@ -738,6 +747,28 @@ func CustomizePodSpec(pts *corev1.PodTemplateSpec, ba common.BaseComponent) { pts.Spec.AutomountServiceAccountToken = &mount } +// Initialize an empty TopologySpreadConstraints list and optionally prefers scheduling across zones/hosts for pods with podMatchLabels +func CustomizeTopologySpreadConstraints(pts *corev1.PodTemplateSpec, podMatchLabels map[string]string) { + if len(podMatchLabels) > 0 { + pts.Spec.TopologySpreadConstraints = append(pts.Spec.TopologySpreadConstraints, corev1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: "topology.kubernetes.io/zone", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: podMatchLabels, + }, + }) + pts.Spec.TopologySpreadConstraints = append(pts.Spec.TopologySpreadConstraints, corev1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: "kubernetes.io/hostname", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: podMatchLabels, + }, + }) + } +} + // CustomizePersistence ... func CustomizePersistence(statefulSet *appsv1.StatefulSet, ba common.BaseComponent) { obj, ss := ba.(metav1.Object), ba.GetStatefulSet() @@ -1215,6 +1246,25 @@ func GetWatchNamespaces() ([]string, error) { return watchNamespaces, nil } +// MergeTopologySpreadConstraints returns the union of all TopologySpreadConstraint lists. The order of lists passed into the +// func, defines the importance. +func MergeTopologySpreadConstraints(constraints ...[]corev1.TopologySpreadConstraint) []corev1.TopologySpreadConstraint { + dest := make([]corev1.TopologySpreadConstraint, 0) + destTopologyKeyMemo := make(map[string]int) + + for i := range constraints { + for j, constraint := range constraints[i] { + if t, found := destTopologyKeyMemo[constraint.TopologyKey]; found { + dest[t] = constraint + } else { + dest = append(dest, constraint) + destTopologyKeyMemo[constraint.TopologyKey] = j + } + } + } + return dest +} + // MergeMaps returns a map containing the union of al the key-value pairs from the input maps. The order of the maps passed into the // func, defines the importance. e.g. if (keyA, value1) is in map1, and (keyA, value2) is in map2, mergeMaps(map1, map2) would contain (keyA, value2). // If the input map is nil, it is treated as empty map.