diff --git a/cluster-autoscaler/simulator/predicates.go b/cluster-autoscaler/simulator/predicates.go index 2958fabdc098..35e04ae2aeea 100644 --- a/cluster-autoscaler/simulator/predicates.go +++ b/cluster-autoscaler/simulator/predicates.go @@ -21,10 +21,14 @@ import ( "strings" apiv1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" informers "k8s.io/client-go/informers" kube_client "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/pkg/scheduler" "k8s.io/kubernetes/pkg/scheduler/algorithm/predicates" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" "k8s.io/kubernetes/pkg/scheduler/factory" schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo" @@ -64,36 +68,79 @@ func init() { algorithmprovider.ApplyFeatureGates() } +// NoOpEventRecorder is a noop implementation of EventRecorder +type NoOpEventRecorder struct{} + +// Event is a noop method implementation +func (NoOpEventRecorder) Event(object runtime.Object, eventtype, reason, message string) { +} + +// Eventf is a noop method implementation +func (NoOpEventRecorder) Eventf(object runtime.Object, eventtype, reason, messageFmt string, args ...interface{}) { +} + +// PastEventf is a noop method implementation +func (NoOpEventRecorder) PastEventf(object runtime.Object, timestamp metav1.Time, eventtype, reason, messageFmt string, args ...interface{}) { +} + +// AnnotatedEventf is a noop method implementation +func (NoOpEventRecorder) AnnotatedEventf(object runtime.Object, annotations map[string]string, eventtype, reason, messageFmt string, args ...interface{}) { +} + // NewPredicateChecker builds PredicateChecker. func NewPredicateChecker(kubeClient kube_client.Interface, stop <-chan struct{}) (*PredicateChecker, error) { - provider, err := factory.GetAlgorithmProvider(factory.DefaultProvider) - if err != nil { - return nil, err - } informerFactory := informers.NewSharedInformerFactory(kubeClient, 0) - - schedulerConfigFactory := factory.NewConfigFactory(&factory.ConfigFactoryArgs{ - SchedulerName: "cluster-autoscaler", + algorithmProvider := factory.DefaultProvider + + // Set up the configurator which can create schedulers from configs. 
+ nodeInformer := informerFactory.Core().V1().Nodes() + podInformer := informerFactory.Core().V1().Pods() + pvInformer := informerFactory.Core().V1().PersistentVolumes() + pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims() + replicationControllerInformer := informerFactory.Core().V1().ReplicationControllers() + replicaSetInformer := informerFactory.Apps().V1().ReplicaSets() + statefulSetInformer := informerFactory.Apps().V1().StatefulSets() + serviceInformer := informerFactory.Core().V1().Services() + pdbInformer := informerFactory.Policy().V1beta1().PodDisruptionBudgets() + storageClassInformer := informerFactory.Storage().V1().StorageClasses() + configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{ + SchedulerName: apiv1.DefaultSchedulerName, Client: kubeClient, - NodeInformer: informerFactory.Core().V1().Nodes(), - PodInformer: informerFactory.Core().V1().Pods(), - PvInformer: informerFactory.Core().V1().PersistentVolumes(), - PvcInformer: informerFactory.Core().V1().PersistentVolumeClaims(), - ReplicationControllerInformer: informerFactory.Core().V1().ReplicationControllers(), - ReplicaSetInformer: informerFactory.Apps().V1().ReplicaSets(), - StatefulSetInformer: informerFactory.Apps().V1().StatefulSets(), - ServiceInformer: informerFactory.Core().V1().Services(), - PdbInformer: informerFactory.Policy().V1beta1().PodDisruptionBudgets(), - StorageClassInformer: informerFactory.Storage().V1().StorageClasses(), + NodeInformer: nodeInformer, + PodInformer: podInformer, + PvInformer: pvInformer, + PvcInformer: pvcInformer, + ReplicationControllerInformer: replicationControllerInformer, + ReplicaSetInformer: replicaSetInformer, + StatefulSetInformer: statefulSetInformer, + ServiceInformer: serviceInformer, + PdbInformer: pdbInformer, + StorageClassInformer: storageClassInformer, HardPodAffinitySymmetricWeight: apiv1.DefaultHardPodAffinitySymmetricWeight, + DisablePreemption: false, + PercentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore, + BindTimeoutSeconds: scheduler.BindTimeoutSeconds, }) - informerFactory.Start(stop) - - metadataProducer, err := schedulerConfigFactory.GetPredicateMetadataProducer() + // Create the config from a named algorithm provider. + config, err := configurator.CreateFromProvider(algorithmProvider) if err != nil { - return nil, err + return nil, fmt.Errorf("couldn't create scheduler using provider %q: %v", algorithmProvider, err) + } + // Additional tweaks to the config produced by the configurator. + config.Recorder = NoOpEventRecorder{} + config.DisablePreemption = false + config.StopEverything = stop + + // Create the scheduler. + sched := scheduler.NewFromConfig(config) + + scheduler.AddAllEventHandlers(sched, apiv1.DefaultSchedulerName, + nodeInformer, podInformer, pvInformer, pvcInformer, replicationControllerInformer, replicaSetInformer, statefulSetInformer, serviceInformer, pdbInformer, storageClassInformer) + + predicateMap := map[string]predicates.FitPredicate{} + for predicateName, predicateFunc := range sched.Config().Algorithm.Predicates() { + predicateMap[predicateName] = predicateFunc } - predicateMap, err := schedulerConfigFactory.GetPredicates(provider.FitPredicateKeys) predicateMap["ready"] = isNodeReadyAndSchedulablePredicate if err != nil { return nil, err @@ -121,8 +168,12 @@ func NewPredicateChecker(kubeClient kube_client.Interface, stop <-chan struct{}) klog.V(1).Infof("Using predicate %s", predInfo.name) } - // TODO: Verify that run is not needed anymore. 
- // schedulerConfigFactory.Run() + informerFactory.Start(stop) + + metadataProducer, err := configurator.GetPredicateMetadataProducer() + if err != nil { + return nil, fmt.Errorf("could not obtain predicateMetadataProducer; %v", err.Error()) + } return &PredicateChecker{ predicates: predicateList, diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/latest/latest.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/latest/latest.go new file mode 100644 index 000000000000..97696d57be17 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/latest/latest.go @@ -0,0 +1,54 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package latest + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer/json" + "k8s.io/apimachinery/pkg/runtime/serializer/versioning" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" + // Init the api v1 package + _ "k8s.io/kubernetes/pkg/scheduler/api/v1" +) + +// Version is the string that represents the current external default version. +const Version = "v1" + +// OldestVersion is the string that represents the oldest server version supported. +const OldestVersion = "v1" + +// Versions is the list of versions that are recognized in code. The order provided +// may be assumed to be least feature rich to most feature rich, and clients may +// choose to prefer the latter items in the list over the former items when presented +// with a set of versions to choose. +var Versions = []string{"v1"} + +// Codec is the default codec for serializing input that should use +// the latest supported version. It supports JSON by default. +var Codec runtime.Codec + +func init() { + jsonSerializer := json.NewSerializer(json.DefaultMetaFactory, schedulerapi.Scheme, schedulerapi.Scheme, true) + Codec = versioning.NewDefaultingCodecForScheme( + schedulerapi.Scheme, + jsonSerializer, + jsonSerializer, + schema.GroupVersion{Version: Version}, + runtime.InternalGroupVersioner, + ) +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/doc.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/doc.go new file mode 100644 index 000000000000..3386c4d8d214 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// +k8s:deepcopy-gen=package + +// Package v1 contains scheduler API objects. +package v1 // import "k8s.io/kubernetes/pkg/scheduler/api/v1" diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/register.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/register.go new file mode 100644 index 000000000000..504de9c7672b --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/register.go @@ -0,0 +1,59 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" +) + +// SchemeGroupVersion is group version used to register these objects +// TODO this should be in the "scheduler" group +var SchemeGroupVersion = schema.GroupVersion{Group: "", Version: "v1"} + +func init() { + if err := addKnownTypes(schedulerapi.Scheme); err != nil { + // Programmer error. + panic(err) + } +} + +var ( + // TODO: move SchemeBuilder with zz_generated.deepcopy.go to k8s.io/api. + // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. + + // SchemeBuilder is a v1 api scheme builder. + SchemeBuilder runtime.SchemeBuilder + localSchemeBuilder = &SchemeBuilder + // AddToScheme is used to add stored functions to scheme. + AddToScheme = localSchemeBuilder.AddToScheme +) + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing. + localSchemeBuilder.Register(addKnownTypes) +} + +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &Policy{}, + ) + return nil +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/types.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/types.go new file mode 100644 index 000000000000..f933a6c5174d --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/types.go @@ -0,0 +1,346 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + gojson "encoding/json" + "time" + + apiv1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// Policy describes a struct for a policy resource used in api. +type Policy struct { + metav1.TypeMeta `json:",inline"` + // Holds the information to configure the fit predicate functions + Predicates []PredicatePolicy `json:"predicates"` + // Holds the information to configure the priority functions + Priorities []PriorityPolicy `json:"priorities"` + // Holds the information to communicate with the extender(s) + ExtenderConfigs []ExtenderConfig `json:"extenders"` + // RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule + // corresponding to every RequiredDuringScheduling affinity rule. + // HardPodAffinitySymmetricWeight represents the weight of implicit PreferredDuringScheduling affinity rule, in the range 1-100. + HardPodAffinitySymmetricWeight int `json:"hardPodAffinitySymmetricWeight"` + + // When AlwaysCheckAllPredicates is set to true, scheduler checks all + // the configured predicates even after one or more of them fails. + // When the flag is set to false, scheduler skips checking the rest + // of the predicates after it finds one predicate that failed. + AlwaysCheckAllPredicates bool `json:"alwaysCheckAllPredicates"` +} + +// PredicatePolicy describes a struct of a predicate policy. +type PredicatePolicy struct { + // Identifier of the predicate policy + // For a custom predicate, the name can be user-defined + // For the Kubernetes provided predicates, the name is the identifier of the pre-defined predicate + Name string `json:"name"` + // Holds the parameters to configure the given predicate + Argument *PredicateArgument `json:"argument"` +} + +// PriorityPolicy describes a struct of a priority policy. +type PriorityPolicy struct { + // Identifier of the priority policy + // For a custom priority, the name can be user-defined + // For the Kubernetes provided priority functions, the name is the identifier of the pre-defined priority function + Name string `json:"name"` + // The numeric multiplier for the node scores that the priority function generates + // The weight should be non-zero and can be a positive or a negative integer + Weight int `json:"weight"` + // Holds the parameters to configure the given priority function + Argument *PriorityArgument `json:"argument"` +} + +// PredicateArgument represents the arguments to configure predicate functions in scheduler policy configuration. +// Only one of its members may be specified +type PredicateArgument struct { + // The predicate that provides affinity for pods belonging to a service + // It uses a label to identify nodes that belong to the same "group" + ServiceAffinity *ServiceAffinity `json:"serviceAffinity"` + // The predicate that checks whether a particular node has a certain label + // defined or not, regardless of value + LabelsPresence *LabelsPresence `json:"labelsPresence"` +} + +// PriorityArgument represents the arguments to configure priority functions in scheduler policy configuration. 
+// Only one of its members may be specified +type PriorityArgument struct { + // The priority function that ensures a good spread (anti-affinity) for pods belonging to a service + // It uses a label to identify nodes that belong to the same "group" + ServiceAntiAffinity *ServiceAntiAffinity `json:"serviceAntiAffinity"` + // The priority function that checks whether a particular node has a certain label + // defined or not, regardless of value + LabelPreference *LabelPreference `json:"labelPreference"` + // The RequestedToCapacityRatio priority function is parametrized with function shape. + RequestedToCapacityRatioArguments *RequestedToCapacityRatioArguments `json:"requestedToCapacityRatioArguments"` +} + +// ServiceAffinity holds the parameters that are used to configure the corresponding predicate in scheduler policy configuration. +type ServiceAffinity struct { + // The list of labels that identify node "groups" + // All of the labels should match for the node to be considered a fit for hosting the pod + Labels []string `json:"labels"` +} + +// LabelsPresence holds the parameters that are used to configure the corresponding predicate in scheduler policy configuration. +type LabelsPresence struct { + // The list of labels that identify node "groups" + // All of the labels should be either present (or absent) for the node to be considered a fit for hosting the pod + Labels []string `json:"labels"` + // The boolean flag that indicates whether the labels should be present or absent from the node + Presence bool `json:"presence"` +} + +// ServiceAntiAffinity holds the parameters that are used to configure the corresponding priority function +type ServiceAntiAffinity struct { + // Used to identify node "groups" + Label string `json:"label"` +} + +// LabelPreference holds the parameters that are used to configure the corresponding priority function +type LabelPreference struct { + // Used to identify node "groups" + Label string `json:"label"` + // This is a boolean flag + // If true, higher priority is given to nodes that have the label + // If false, higher priority is given to nodes that do not have the label + Presence bool `json:"presence"` +} + +// RequestedToCapacityRatioArguments holds arguments specific to RequestedToCapacityRatio priority function +type RequestedToCapacityRatioArguments struct { + // Array of point defining priority function shape + UtilizationShape []UtilizationShapePoint `json:"shape"` +} + +// UtilizationShapePoint represents single point of priority function shape +type UtilizationShapePoint struct { + // Utilization (x axis). Valid values are 0 to 100. Fully utilized node maps to 100. + Utilization int `json:"utilization"` + // Score assigned to given utilization (y axis). Valid values are 0 to 10. + Score int `json:"score"` +} + +// ExtenderManagedResource describes the arguments of extended resources +// managed by an extender. +type ExtenderManagedResource struct { + // Name is the extended resource name. + Name apiv1.ResourceName `json:"name,casttype=ResourceName"` + // IgnoredByScheduler indicates whether kube-scheduler should ignore this + // resource when applying predicates. + IgnoredByScheduler bool `json:"ignoredByScheduler,omitempty"` +} + +// ExtenderTLSConfig contains settings to enable TLS with extender +type ExtenderTLSConfig struct { + // Server should be accessed without verifying the TLS certificate. For testing only. 
+ Insecure bool `json:"insecure,omitempty"` + // ServerName is passed to the server for SNI and is used in the client to check server + // certificates against. If ServerName is empty, the hostname used to contact the + // server is used. + ServerName string `json:"serverName,omitempty"` + + // Server requires TLS client certificate authentication + CertFile string `json:"certFile,omitempty"` + // Server requires TLS client certificate authentication + KeyFile string `json:"keyFile,omitempty"` + // Trusted root certificates for server + CAFile string `json:"caFile,omitempty"` + + // CertData holds PEM-encoded bytes (typically read from a client certificate file). + // CertData takes precedence over CertFile + CertData []byte `json:"certData,omitempty"` + // KeyData holds PEM-encoded bytes (typically read from a client certificate key file). + // KeyData takes precedence over KeyFile + KeyData []byte `json:"keyData,omitempty"` + // CAData holds PEM-encoded bytes (typically read from a root certificates bundle). + // CAData takes precedence over CAFile + CAData []byte `json:"caData,omitempty"` +} + +// ExtenderConfig holds the parameters used to communicate with the extender. If a verb is unspecified/empty, +// it is assumed that the extender chose not to provide that extension. +type ExtenderConfig struct { + // URLPrefix at which the extender is available + URLPrefix string `json:"urlPrefix"` + // Verb for the filter call, empty if not supported. This verb is appended to the URLPrefix when issuing the filter call to extender. + FilterVerb string `json:"filterVerb,omitempty"` + // Verb for the preempt call, empty if not supported. This verb is appended to the URLPrefix when issuing the preempt call to extender. + PreemptVerb string `json:"preemptVerb,omitempty"` + // Verb for the prioritize call, empty if not supported. This verb is appended to the URLPrefix when issuing the prioritize call to extender. + PrioritizeVerb string `json:"prioritizeVerb,omitempty"` + // The numeric multiplier for the node scores that the prioritize call generates. + // The weight should be a positive integer + Weight int `json:"weight,omitempty"` + // Verb for the bind call, empty if not supported. This verb is appended to the URLPrefix when issuing the bind call to extender. + // If this method is implemented by the extender, it is the extender's responsibility to bind the pod to apiserver. Only one extender + // can implement this function. + BindVerb string `json:"bindVerb,omitempty"` + // EnableHTTPS specifies whether https should be used to communicate with the extender + EnableHTTPS bool `json:"enableHttps,omitempty"` + // TLSConfig specifies the transport layer security config + TLSConfig *ExtenderTLSConfig `json:"tlsConfig,omitempty"` + // HTTPTimeout specifies the timeout duration for a call to the extender. Filter timeout fails the scheduling of the pod. Prioritize + // timeout is ignored, k8s/other extenders priorities are used to select the node. + HTTPTimeout time.Duration `json:"httpTimeout,omitempty"` + // NodeCacheCapable specifies that the extender is capable of caching node information, + // so the scheduler should only send minimal information about the eligible nodes + // assuming that the extender already cached full details of all nodes in the cluster + NodeCacheCapable bool `json:"nodeCacheCapable,omitempty"` + // ManagedResources is a list of extended resources that are managed by + // this extender. 
+ // - A pod will be sent to the extender on the Filter, Prioritize and Bind + // (if the extender is the binder) phases iff the pod requests at least + // one of the extended resources in this list. If empty or unspecified, + // all pods will be sent to this extender. + // - If IgnoredByScheduler is set to true for a resource, kube-scheduler + // will skip checking the resource in predicates. + // +optional + ManagedResources []ExtenderManagedResource `json:"managedResources,omitempty"` + // Ignorable specifies if the extender is ignorable, i.e. scheduling should not + // fail when the extender returns an error or is not reachable. + Ignorable bool `json:"ignorable,omitempty"` +} + +// caseInsensitiveExtenderConfig is a type alias which lets us use the stdlib case-insensitive decoding +// to preserve compatibility with incorrectly specified scheduler config fields: +// * BindVerb, which originally did not specify a json tag, and required upper-case serialization in 1.7 +// * TLSConfig, which uses a struct not intended for serialization, and does not include any json tags +type caseInsensitiveExtenderConfig *ExtenderConfig + +// UnmarshalJSON implements the json.Unmarshaller interface. +// This preserves compatibility with incorrect case-insensitive configuration fields. +func (t *ExtenderConfig) UnmarshalJSON(b []byte) error { + return gojson.Unmarshal(b, caseInsensitiveExtenderConfig(t)) +} + +// ExtenderArgs represents the arguments needed by the extender to filter/prioritize +// nodes for a pod. +type ExtenderArgs struct { + // Pod being scheduled + Pod *apiv1.Pod `json:"pod"` + // List of candidate nodes where the pod can be scheduled; to be populated + // only if ExtenderConfig.NodeCacheCapable == false + Nodes *apiv1.NodeList `json:"nodes,omitempty"` + // List of candidate node names where the pod can be scheduled; to be + // populated only if ExtenderConfig.NodeCacheCapable == true + NodeNames *[]string `json:"nodenames,omitempty"` +} + +// ExtenderPreemptionResult represents the result returned by preemption phase of extender. +type ExtenderPreemptionResult struct { + NodeNameToMetaVictims map[string]*MetaVictims `json:"nodeNameToMetaVictims,omitempty"` +} + +// ExtenderPreemptionArgs represents the arguments needed by the extender to preempt pods on nodes. +type ExtenderPreemptionArgs struct { + // Pod being scheduled + Pod *apiv1.Pod `json:"pod"` + // Victims map generated by scheduler preemption phase + // Only set NodeNameToMetaVictims if ExtenderConfig.NodeCacheCapable == true. Otherwise, only set NodeNameToVictims. + NodeNameToVictims map[string]*Victims `json:"nodeToVictims,omitempty"` + NodeNameToMetaVictims map[string]*MetaVictims `json:"nodeNameToMetaVictims,omitempty"` +} + +// Victims represents: +// pods: a group of pods expected to be preempted. +// numPDBViolations: the count of violations of PodDisruptionBudget +type Victims struct { + Pods []*apiv1.Pod `json:"pods"` + NumPDBViolations int `json:"numPDBViolations"` +} + +// MetaPod represent identifier for a v1.Pod +type MetaPod struct { + UID string `json:"uid"` +} + +// MetaVictims represents: +// pods: a group of pods expected to be preempted. +// Only Pod identifiers will be sent and user are expect to get v1.Pod in their own way. 
+// numPDBViolations: the count of violations of PodDisruptionBudget +type MetaVictims struct { + Pods []*MetaPod `json:"pods"` + NumPDBViolations int `json:"numPDBViolations"` +} + +// FailedNodesMap represents the filtered out nodes, with node names and failure messages +type FailedNodesMap map[string]string + +// ExtenderFilterResult represents the results of a filter call to an extender +type ExtenderFilterResult struct { + // Filtered set of nodes where the pod can be scheduled; to be populated + // only if ExtenderConfig.NodeCacheCapable == false + Nodes *apiv1.NodeList `json:"nodes,omitempty"` + // Filtered set of nodes where the pod can be scheduled; to be populated + // only if ExtenderConfig.NodeCacheCapable == true + NodeNames *[]string `json:"nodenames,omitempty"` + // Filtered out nodes where the pod can't be scheduled and the failure messages + FailedNodes FailedNodesMap `json:"failedNodes,omitempty"` + // Error message indicating failure + Error string `json:"error,omitempty"` +} + +// ExtenderBindingArgs represents the arguments to an extender for binding a pod to a node. +type ExtenderBindingArgs struct { + // PodName is the name of the pod being bound + PodName string + // PodNamespace is the namespace of the pod being bound + PodNamespace string + // PodUID is the UID of the pod being bound + PodUID types.UID + // Node selected by the scheduler + Node string +} + +// ExtenderBindingResult represents the result of binding of a pod to a node from an extender. +type ExtenderBindingResult struct { + // Error message indicating failure + Error string +} + +// HostPriority represents the priority of scheduling to a particular host, higher priority is better. +type HostPriority struct { + // Name of the host + Host string `json:"host"` + // Score associated with the host + Score int `json:"score"` +} + +// HostPriorityList declares a []HostPriority type. +type HostPriorityList []HostPriority + +func (h HostPriorityList) Len() int { + return len(h) +} + +func (h HostPriorityList) Less(i, j int) bool { + if h[i].Score == h[j].Score { + return h[i].Host < h[j].Host + } + return h[i].Score < h[j].Score +} + +func (h HostPriorityList) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/zz_generated.deepcopy.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/zz_generated.deepcopy.go new file mode 100644 index 000000000000..b201de16a0c4 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/api/v1/zz_generated.deepcopy.go @@ -0,0 +1,669 @@ +// +build !ignore_autogenerated + +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by deepcopy-gen. DO NOT EDIT. + +package v1 + +import ( + corev1 "k8s.io/api/core/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ExtenderArgs) DeepCopyInto(out *ExtenderArgs) { + *out = *in + if in.Pod != nil { + in, out := &in.Pod, &out.Pod + *out = new(corev1.Pod) + (*in).DeepCopyInto(*out) + } + if in.Nodes != nil { + in, out := &in.Nodes, &out.Nodes + *out = new(corev1.NodeList) + (*in).DeepCopyInto(*out) + } + if in.NodeNames != nil { + in, out := &in.NodeNames, &out.NodeNames + *out = new([]string) + if **in != nil { + in, out := *in, *out + *out = make([]string, len(*in)) + copy(*out, *in) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderArgs. +func (in *ExtenderArgs) DeepCopy() *ExtenderArgs { + if in == nil { + return nil + } + out := new(ExtenderArgs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderBindingArgs) DeepCopyInto(out *ExtenderBindingArgs) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderBindingArgs. +func (in *ExtenderBindingArgs) DeepCopy() *ExtenderBindingArgs { + if in == nil { + return nil + } + out := new(ExtenderBindingArgs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderBindingResult) DeepCopyInto(out *ExtenderBindingResult) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderBindingResult. +func (in *ExtenderBindingResult) DeepCopy() *ExtenderBindingResult { + if in == nil { + return nil + } + out := new(ExtenderBindingResult) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderConfig) DeepCopyInto(out *ExtenderConfig) { + *out = *in + if in.TLSConfig != nil { + in, out := &in.TLSConfig, &out.TLSConfig + *out = new(ExtenderTLSConfig) + (*in).DeepCopyInto(*out) + } + if in.ManagedResources != nil { + in, out := &in.ManagedResources, &out.ManagedResources + *out = make([]ExtenderManagedResource, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderConfig. +func (in *ExtenderConfig) DeepCopy() *ExtenderConfig { + if in == nil { + return nil + } + out := new(ExtenderConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderFilterResult) DeepCopyInto(out *ExtenderFilterResult) { + *out = *in + if in.Nodes != nil { + in, out := &in.Nodes, &out.Nodes + *out = new(corev1.NodeList) + (*in).DeepCopyInto(*out) + } + if in.NodeNames != nil { + in, out := &in.NodeNames, &out.NodeNames + *out = new([]string) + if **in != nil { + in, out := *in, *out + *out = make([]string, len(*in)) + copy(*out, *in) + } + } + if in.FailedNodes != nil { + in, out := &in.FailedNodes, &out.FailedNodes + *out = make(FailedNodesMap, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderFilterResult. 
+func (in *ExtenderFilterResult) DeepCopy() *ExtenderFilterResult { + if in == nil { + return nil + } + out := new(ExtenderFilterResult) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderManagedResource) DeepCopyInto(out *ExtenderManagedResource) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderManagedResource. +func (in *ExtenderManagedResource) DeepCopy() *ExtenderManagedResource { + if in == nil { + return nil + } + out := new(ExtenderManagedResource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderPreemptionArgs) DeepCopyInto(out *ExtenderPreemptionArgs) { + *out = *in + if in.Pod != nil { + in, out := &in.Pod, &out.Pod + *out = new(corev1.Pod) + (*in).DeepCopyInto(*out) + } + if in.NodeNameToVictims != nil { + in, out := &in.NodeNameToVictims, &out.NodeNameToVictims + *out = make(map[string]*Victims, len(*in)) + for key, val := range *in { + var outVal *Victims + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(Victims) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } + if in.NodeNameToMetaVictims != nil { + in, out := &in.NodeNameToMetaVictims, &out.NodeNameToMetaVictims + *out = make(map[string]*MetaVictims, len(*in)) + for key, val := range *in { + var outVal *MetaVictims + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(MetaVictims) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderPreemptionArgs. +func (in *ExtenderPreemptionArgs) DeepCopy() *ExtenderPreemptionArgs { + if in == nil { + return nil + } + out := new(ExtenderPreemptionArgs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtenderPreemptionResult) DeepCopyInto(out *ExtenderPreemptionResult) { + *out = *in + if in.NodeNameToMetaVictims != nil { + in, out := &in.NodeNameToMetaVictims, &out.NodeNameToMetaVictims + *out = make(map[string]*MetaVictims, len(*in)) + for key, val := range *in { + var outVal *MetaVictims + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(MetaVictims) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderPreemptionResult. +func (in *ExtenderPreemptionResult) DeepCopy() *ExtenderPreemptionResult { + if in == nil { + return nil + } + out := new(ExtenderPreemptionResult) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ExtenderTLSConfig) DeepCopyInto(out *ExtenderTLSConfig) { + *out = *in + if in.CertData != nil { + in, out := &in.CertData, &out.CertData + *out = make([]byte, len(*in)) + copy(*out, *in) + } + if in.KeyData != nil { + in, out := &in.KeyData, &out.KeyData + *out = make([]byte, len(*in)) + copy(*out, *in) + } + if in.CAData != nil { + in, out := &in.CAData, &out.CAData + *out = make([]byte, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderTLSConfig. +func (in *ExtenderTLSConfig) DeepCopy() *ExtenderTLSConfig { + if in == nil { + return nil + } + out := new(ExtenderTLSConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in FailedNodesMap) DeepCopyInto(out *FailedNodesMap) { + { + in := &in + *out = make(FailedNodesMap, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + return + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedNodesMap. +func (in FailedNodesMap) DeepCopy() FailedNodesMap { + if in == nil { + return nil + } + out := new(FailedNodesMap) + in.DeepCopyInto(out) + return *out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HostPriority) DeepCopyInto(out *HostPriority) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HostPriority. +func (in *HostPriority) DeepCopy() *HostPriority { + if in == nil { + return nil + } + out := new(HostPriority) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in HostPriorityList) DeepCopyInto(out *HostPriorityList) { + { + in := &in + *out = make(HostPriorityList, len(*in)) + copy(*out, *in) + return + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HostPriorityList. +func (in HostPriorityList) DeepCopy() HostPriorityList { + if in == nil { + return nil + } + out := new(HostPriorityList) + in.DeepCopyInto(out) + return *out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LabelPreference) DeepCopyInto(out *LabelPreference) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LabelPreference. +func (in *LabelPreference) DeepCopy() *LabelPreference { + if in == nil { + return nil + } + out := new(LabelPreference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LabelsPresence) DeepCopyInto(out *LabelsPresence) { + *out = *in + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LabelsPresence. +func (in *LabelsPresence) DeepCopy() *LabelsPresence { + if in == nil { + return nil + } + out := new(LabelsPresence) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *MetaPod) DeepCopyInto(out *MetaPod) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetaPod. +func (in *MetaPod) DeepCopy() *MetaPod { + if in == nil { + return nil + } + out := new(MetaPod) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetaVictims) DeepCopyInto(out *MetaVictims) { + *out = *in + if in.Pods != nil { + in, out := &in.Pods, &out.Pods + *out = make([]*MetaPod, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(MetaPod) + **out = **in + } + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetaVictims. +func (in *MetaVictims) DeepCopy() *MetaVictims { + if in == nil { + return nil + } + out := new(MetaVictims) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Policy) DeepCopyInto(out *Policy) { + *out = *in + out.TypeMeta = in.TypeMeta + if in.Predicates != nil { + in, out := &in.Predicates, &out.Predicates + *out = make([]PredicatePolicy, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Priorities != nil { + in, out := &in.Priorities, &out.Priorities + *out = make([]PriorityPolicy, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.ExtenderConfigs != nil { + in, out := &in.ExtenderConfigs, &out.ExtenderConfigs + *out = make([]ExtenderConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Policy. +func (in *Policy) DeepCopy() *Policy { + if in == nil { + return nil + } + out := new(Policy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Policy) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PredicateArgument) DeepCopyInto(out *PredicateArgument) { + *out = *in + if in.ServiceAffinity != nil { + in, out := &in.ServiceAffinity, &out.ServiceAffinity + *out = new(ServiceAffinity) + (*in).DeepCopyInto(*out) + } + if in.LabelsPresence != nil { + in, out := &in.LabelsPresence, &out.LabelsPresence + *out = new(LabelsPresence) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PredicateArgument. +func (in *PredicateArgument) DeepCopy() *PredicateArgument { + if in == nil { + return nil + } + out := new(PredicateArgument) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PredicatePolicy) DeepCopyInto(out *PredicatePolicy) { + *out = *in + if in.Argument != nil { + in, out := &in.Argument, &out.Argument + *out = new(PredicateArgument) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PredicatePolicy. 
+func (in *PredicatePolicy) DeepCopy() *PredicatePolicy { + if in == nil { + return nil + } + out := new(PredicatePolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PriorityArgument) DeepCopyInto(out *PriorityArgument) { + *out = *in + if in.ServiceAntiAffinity != nil { + in, out := &in.ServiceAntiAffinity, &out.ServiceAntiAffinity + *out = new(ServiceAntiAffinity) + **out = **in + } + if in.LabelPreference != nil { + in, out := &in.LabelPreference, &out.LabelPreference + *out = new(LabelPreference) + **out = **in + } + if in.RequestedToCapacityRatioArguments != nil { + in, out := &in.RequestedToCapacityRatioArguments, &out.RequestedToCapacityRatioArguments + *out = new(RequestedToCapacityRatioArguments) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PriorityArgument. +func (in *PriorityArgument) DeepCopy() *PriorityArgument { + if in == nil { + return nil + } + out := new(PriorityArgument) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PriorityPolicy) DeepCopyInto(out *PriorityPolicy) { + *out = *in + if in.Argument != nil { + in, out := &in.Argument, &out.Argument + *out = new(PriorityArgument) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PriorityPolicy. +func (in *PriorityPolicy) DeepCopy() *PriorityPolicy { + if in == nil { + return nil + } + out := new(PriorityPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RequestedToCapacityRatioArguments) DeepCopyInto(out *RequestedToCapacityRatioArguments) { + *out = *in + if in.UtilizationShape != nil { + in, out := &in.UtilizationShape, &out.UtilizationShape + *out = make([]UtilizationShapePoint, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RequestedToCapacityRatioArguments. +func (in *RequestedToCapacityRatioArguments) DeepCopy() *RequestedToCapacityRatioArguments { + if in == nil { + return nil + } + out := new(RequestedToCapacityRatioArguments) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceAffinity) DeepCopyInto(out *ServiceAffinity) { + *out = *in + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceAffinity. +func (in *ServiceAffinity) DeepCopy() *ServiceAffinity { + if in == nil { + return nil + } + out := new(ServiceAffinity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceAntiAffinity) DeepCopyInto(out *ServiceAntiAffinity) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceAntiAffinity. 
+func (in *ServiceAntiAffinity) DeepCopy() *ServiceAntiAffinity { + if in == nil { + return nil + } + out := new(ServiceAntiAffinity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *UtilizationShapePoint) DeepCopyInto(out *UtilizationShapePoint) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UtilizationShapePoint. +func (in *UtilizationShapePoint) DeepCopy() *UtilizationShapePoint { + if in == nil { + return nil + } + out := new(UtilizationShapePoint) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Victims) DeepCopyInto(out *Victims) { + *out = *in + if in.Pods != nil { + in, out := &in.Pods, &out.Pods + *out = make([]*corev1.Pod, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(corev1.Pod) + (*in).DeepCopyInto(*out) + } + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Victims. +func (in *Victims) DeepCopy() *Victims { + if in == nil { + return nil + } + out := new(Victims) + in.DeepCopyInto(out) + return out +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/types.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/types.go new file mode 100644 index 000000000000..f855b7c34af5 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/types.go @@ -0,0 +1,137 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + componentbaseconfig "k8s.io/component-base/config" +) + +const ( + // SchedulerDefaultLockObjectNamespace defines default scheduler lock object namespace ("kube-system") + SchedulerDefaultLockObjectNamespace string = metav1.NamespaceSystem + + // SchedulerDefaultLockObjectName defines default scheduler lock object name ("kube-scheduler") + SchedulerDefaultLockObjectName = "kube-scheduler" + + // SchedulerPolicyConfigMapKey defines the key of the element in the + // scheduler's policy ConfigMap that contains scheduler's policy config. + SchedulerPolicyConfigMapKey = "policy.cfg" + + // SchedulerDefaultProviderName defines the default provider names + SchedulerDefaultProviderName = "DefaultProvider" +) + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// KubeSchedulerConfiguration configures a scheduler +type KubeSchedulerConfiguration struct { + metav1.TypeMeta + + // SchedulerName is name of the scheduler, used to select which pods + // will be processed by this scheduler, based on pod's "spec.SchedulerName". + SchedulerName string + // AlgorithmSource specifies the scheduler algorithm source. 
+ AlgorithmSource SchedulerAlgorithmSource + // RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule + // corresponding to every RequiredDuringScheduling affinity rule. + // HardPodAffinitySymmetricWeight represents the weight of implicit PreferredDuringScheduling affinity rule, in the range 0-100. + HardPodAffinitySymmetricWeight int32 + + // LeaderElection defines the configuration of leader election client. + LeaderElection KubeSchedulerLeaderElectionConfiguration + + // ClientConnection specifies the kubeconfig file and client connection + // settings for the proxy server to use when communicating with the apiserver. + ClientConnection componentbaseconfig.ClientConnectionConfiguration + // HealthzBindAddress is the IP address and port for the health check server to serve on, + // defaulting to 0.0.0.0:10251 + HealthzBindAddress string + // MetricsBindAddress is the IP address and port for the metrics server to + // serve on, defaulting to 0.0.0.0:10251. + MetricsBindAddress string + + // DebuggingConfiguration holds configuration for Debugging related features + // TODO: We might wanna make this a substruct like Debugging componentbaseconfig.DebuggingConfiguration + componentbaseconfig.DebuggingConfiguration + + // DisablePreemption disables the pod preemption feature. + DisablePreemption bool + + // PercentageOfNodeToScore is the percentage of all nodes that once found feasible + // for running a pod, the scheduler stops its search for more feasible nodes in + // the cluster. This helps improve scheduler's performance. Scheduler always tries to find + // at least "minFeasibleNodesToFind" feasible nodes no matter what the value of this flag is. + // Example: if the cluster size is 500 nodes and the value of this flag is 30, + // then scheduler stops finding further feasible nodes once it finds 150 feasible ones. + // When the value is 0, default percentage (5%--50% based on the size of the cluster) of the + // nodes will be scored. + PercentageOfNodesToScore int32 + + // DEPRECATED. + // Indicate the "all topologies" set for empty topologyKey when it's used for PreferredDuringScheduling pod anti-affinity. + FailureDomains string + + // Duration to wait for a binding operation to complete before timing out + // Value must be non-negative integer. The value zero indicates no waiting. + // If this value is nil, the default value will be used. + BindTimeoutSeconds *int64 +} + +// SchedulerAlgorithmSource is the source of a scheduler algorithm. One source +// field must be specified, and source fields are mutually exclusive. +type SchedulerAlgorithmSource struct { + // Policy is a policy based algorithm source. + Policy *SchedulerPolicySource + // Provider is the name of a scheduling algorithm provider to use. + Provider *string +} + +// SchedulerPolicySource configures a means to obtain a scheduler Policy. One +// source field must be specified, and source fields are mutually exclusive. +type SchedulerPolicySource struct { + // File is a file policy source. + File *SchedulerPolicyFileSource + // ConfigMap is a config map policy source. + ConfigMap *SchedulerPolicyConfigMapSource +} + +// SchedulerPolicyFileSource is a policy serialized to disk and accessed via +// path. +type SchedulerPolicyFileSource struct { + // Path is the location of a serialized policy. + Path string +} + +// SchedulerPolicyConfigMapSource is a policy serialized into a config map value +// under the SchedulerPolicyConfigMapKey key. 
+type SchedulerPolicyConfigMapSource struct { + // Namespace is the namespace of the policy config map. + Namespace string + // Name is the name of hte policy config map. + Name string +} + +// KubeSchedulerLeaderElectionConfiguration expands LeaderElectionConfiguration +// to include scheduler specific configuration. +type KubeSchedulerLeaderElectionConfiguration struct { + componentbaseconfig.LeaderElectionConfiguration + // LockObjectNamespace defines the namespace of the lock object + LockObjectNamespace string + // LockObjectName defines the lock object name + LockObjectName string +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/eventhandlers.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/eventhandlers.go new file mode 100644 index 000000000000..4cf52f381e25 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/eventhandlers.go @@ -0,0 +1,475 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "fmt" + "k8s.io/klog" + "reflect" + + "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + appsinformers "k8s.io/client-go/informers/apps/v1" + coreinformers "k8s.io/client-go/informers/core/v1" + policyinformers "k8s.io/client-go/informers/policy/v1beta1" + storageinformers "k8s.io/client-go/informers/storage/v1" + "k8s.io/client-go/tools/cache" +) + +func (sched *Scheduler) onPvAdd(obj interface{}) { + // Pods created when there are no PVs available will be stuck in + // unschedulable queue. But unbound PVs created for static provisioning and + // delay binding storage class are skipped in PV controller dynamic + // provisioning and binding process, will not trigger events to schedule pod + // again. So we need to move pods to active queue on PV add for this + // scenario. + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onPvUpdate(old, new interface{}) { + // Scheduler.bindVolumesWorker may fail to update assumed pod volume + // bindings due to conflicts if PVs are updated by PV controller or other + // parties, then scheduler will add pod back to unschedulable queue. We + // need to move pods to active queue on PV update for this scenario. + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onPvcAdd(obj interface{}) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onPvcUpdate(old, new interface{}) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onStorageClassAdd(obj interface{}) { + sc, ok := obj.(*storagev1.StorageClass) + if !ok { + klog.Errorf("cannot convert to *storagev1.StorageClass: %v", obj) + return + } + + // CheckVolumeBindingPred fails if pod has unbound immediate PVCs. If these + // PVCs have specified StorageClass name, creating StorageClass objects + // with late binding will cause predicates to pass, so we need to move pods + // to active queue. 
+ // We don't need to invalidate cached results because results will not be + // cached for pod that has unbound immediate PVCs. + if sc.VolumeBindingMode != nil && *sc.VolumeBindingMode == storagev1.VolumeBindingWaitForFirstConsumer { + sched.config.SchedulingQueue.MoveAllToActiveQueue() + } +} + +func (sched *Scheduler) onServiceAdd(obj interface{}) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onServiceUpdate(oldObj interface{}, newObj interface{}) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) onServiceDelete(obj interface{}) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) addNodeToCache(obj interface{}) { + node, ok := obj.(*v1.Node) + if !ok { + klog.Errorf("cannot convert to *v1.Node: %v", obj) + return + } + + if err := sched.config.SchedulerCache.AddNode(node); err != nil { + klog.Errorf("scheduler cache AddNode failed: %v", err) + } + + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +func (sched *Scheduler) updateNodeInCache(oldObj, newObj interface{}) { + oldNode, ok := oldObj.(*v1.Node) + if !ok { + klog.Errorf("cannot convert oldObj to *v1.Node: %v", oldObj) + return + } + newNode, ok := newObj.(*v1.Node) + if !ok { + klog.Errorf("cannot convert newObj to *v1.Node: %v", newObj) + return + } + + if err := sched.config.SchedulerCache.UpdateNode(oldNode, newNode); err != nil { + klog.Errorf("scheduler cache UpdateNode failed: %v", err) + } + + // Only activate unschedulable pods if the node became more schedulable. + // We skip the node property comparison when there is no unschedulable pods in the queue + // to save processing cycles. We still trigger a move to active queue to cover the case + // that a pod being processed by the scheduler is determined unschedulable. We want this + // pod to be reevaluated when a change in the cluster happens. + if sched.config.SchedulingQueue.NumUnschedulablePods() == 0 || nodeSchedulingPropertiesChanged(newNode, oldNode) { + sched.config.SchedulingQueue.MoveAllToActiveQueue() + } +} + +func (sched *Scheduler) deleteNodeFromCache(obj interface{}) { + var node *v1.Node + switch t := obj.(type) { + case *v1.Node: + node = t + case cache.DeletedFinalStateUnknown: + var ok bool + node, ok = t.Obj.(*v1.Node) + if !ok { + klog.Errorf("cannot convert to *v1.Node: %v", t.Obj) + return + } + default: + klog.Errorf("cannot convert to *v1.Node: %v", t) + return + } + // NOTE: Updates must be written to scheduler cache before invalidating + // equivalence cache, because we could snapshot equivalence cache after the + // invalidation and then snapshot the cache itself. If the cache is + // snapshotted before updates are written, we would update equivalence + // cache with stale information which is based on snapshot of old cache. 
+ if err := sched.config.SchedulerCache.RemoveNode(node); err != nil { + klog.Errorf("scheduler cache RemoveNode failed: %v", err) + } +} +func (sched *Scheduler) addPodToSchedulingQueue(obj interface{}) { + if err := sched.config.SchedulingQueue.Add(obj.(*v1.Pod)); err != nil { + utilruntime.HandleError(fmt.Errorf("unable to queue %T: %v", obj, err)) + } +} + +func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) { + pod := newObj.(*v1.Pod) + if sched.skipPodUpdate(pod) { + return + } + if err := sched.config.SchedulingQueue.Update(oldObj.(*v1.Pod), pod); err != nil { + utilruntime.HandleError(fmt.Errorf("unable to update %T: %v", newObj, err)) + } +} + +func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) { + var pod *v1.Pod + switch t := obj.(type) { + case *v1.Pod: + pod = obj.(*v1.Pod) + case cache.DeletedFinalStateUnknown: + var ok bool + pod, ok = t.Obj.(*v1.Pod) + if !ok { + utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched)) + return + } + default: + utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj)) + return + } + if err := sched.config.SchedulingQueue.Delete(pod); err != nil { + utilruntime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err)) + } + if sched.config.VolumeBinder != nil { + // Volume binder only wants to keep unassigned pods + sched.config.VolumeBinder.DeletePodBindings(pod) + } +} + +func (sched *Scheduler) addPodToCache(obj interface{}) { + pod, ok := obj.(*v1.Pod) + if !ok { + klog.Errorf("cannot convert to *v1.Pod: %v", obj) + return + } + + if err := sched.config.SchedulerCache.AddPod(pod); err != nil { + klog.Errorf("scheduler cache AddPod failed: %v", err) + } + + sched.config.SchedulingQueue.AssignedPodAdded(pod) +} + +func (sched *Scheduler) updatePodInCache(oldObj, newObj interface{}) { + oldPod, ok := oldObj.(*v1.Pod) + if !ok { + klog.Errorf("cannot convert oldObj to *v1.Pod: %v", oldObj) + return + } + newPod, ok := newObj.(*v1.Pod) + if !ok { + klog.Errorf("cannot convert newObj to *v1.Pod: %v", newObj) + return + } + + // NOTE: Updates must be written to scheduler cache before invalidating + // equivalence cache, because we could snapshot equivalence cache after the + // invalidation and then snapshot the cache itself. If the cache is + // snapshotted before updates are written, we would update equivalence + // cache with stale information which is based on snapshot of old cache. + if err := sched.config.SchedulerCache.UpdatePod(oldPod, newPod); err != nil { + klog.Errorf("scheduler cache UpdatePod failed: %v", err) + } + + sched.config.SchedulingQueue.AssignedPodUpdated(newPod) +} + +func (sched *Scheduler) deletePodFromCache(obj interface{}) { + var pod *v1.Pod + switch t := obj.(type) { + case *v1.Pod: + pod = t + case cache.DeletedFinalStateUnknown: + var ok bool + pod, ok = t.Obj.(*v1.Pod) + if !ok { + klog.Errorf("cannot convert to *v1.Pod: %v", t.Obj) + return + } + default: + klog.Errorf("cannot convert to *v1.Pod: %v", t) + return + } + // NOTE: Updates must be written to scheduler cache before invalidating + // equivalence cache, because we could snapshot equivalence cache after the + // invalidation and then snapshot the cache itself. If the cache is + // snapshotted before updates are written, we would update equivalence + // cache with stale information which is based on snapshot of old cache. 
+ if err := sched.config.SchedulerCache.RemovePod(pod); err != nil { + klog.Errorf("scheduler cache RemovePod failed: %v", err) + } + + sched.config.SchedulingQueue.MoveAllToActiveQueue() +} + +// assignedPod selects pods that are assigned (scheduled and running). +func assignedPod(pod *v1.Pod) bool { + return len(pod.Spec.NodeName) != 0 +} + +// responsibleForPod returns true if the pod has asked to be scheduled by the given scheduler. +func responsibleForPod(pod *v1.Pod, schedulerName string) bool { + return schedulerName == pod.Spec.SchedulerName +} + +// skipPodUpdate checks whether the specified pod update should be ignored. +// This function will return true if +// - The pod has already been assumed, AND +// - The pod has only its ResourceVersion, Spec.NodeName and/or Annotations +// updated. +func (sched *Scheduler) skipPodUpdate(pod *v1.Pod) bool { + // Non-assumed pods should never be skipped. + isAssumed, err := sched.config.SchedulerCache.IsAssumedPod(pod) + if err != nil { + utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err)) + return false + } + if !isAssumed { + return false + } + + // Gets the assumed pod from the cache. + assumedPod, err := sched.config.SchedulerCache.GetPod(pod) + if err != nil { + utilruntime.HandleError(fmt.Errorf("failed to get assumed pod %s/%s from cache: %v", pod.Namespace, pod.Name, err)) + return false + } + + // Compares the assumed pod in the cache with the pod update. If they are + // equal (with certain fields excluded), this pod update will be skipped. + f := func(pod *v1.Pod) *v1.Pod { + p := pod.DeepCopy() + // ResourceVersion must be excluded because each object update will + // have a new resource version. + p.ResourceVersion = "" + // Spec.NodeName must be excluded because the pod assumed in the cache + // is expected to have a node assigned while the pod update may nor may + // not have this field set. + p.Spec.NodeName = "" + // Annotations must be excluded for the reasons described in + // https://github.com/kubernetes/kubernetes/issues/52914. + p.Annotations = nil + return p + } + assumedPodCopy, podCopy := f(assumedPod), f(pod) + if !reflect.DeepEqual(assumedPodCopy, podCopy) { + return false + } + klog.V(3).Infof("Skipping pod %s/%s update", pod.Namespace, pod.Name) + return true +} + +// AddAllEventHandlers is a helper function used in tests and in Scheduler +// to add event handlers for various informers. 
+func AddAllEventHandlers( + sched *Scheduler, + schedulerName string, + nodeInformer coreinformers.NodeInformer, + podInformer coreinformers.PodInformer, + pvInformer coreinformers.PersistentVolumeInformer, + pvcInformer coreinformers.PersistentVolumeClaimInformer, + replicationControllerInformer coreinformers.ReplicationControllerInformer, + replicaSetInformer appsinformers.ReplicaSetInformer, + statefulSetInformer appsinformers.StatefulSetInformer, + serviceInformer coreinformers.ServiceInformer, + pdbInformer policyinformers.PodDisruptionBudgetInformer, + storageClassInformer storageinformers.StorageClassInformer, +) { + // scheduled pod cache + podInformer.Informer().AddEventHandler( + cache.FilteringResourceEventHandler{ + FilterFunc: func(obj interface{}) bool { + switch t := obj.(type) { + case *v1.Pod: + return assignedPod(t) + case cache.DeletedFinalStateUnknown: + if pod, ok := t.Obj.(*v1.Pod); ok { + return assignedPod(pod) + } + utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched)) + return false + default: + utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj)) + return false + } + }, + Handler: cache.ResourceEventHandlerFuncs{ + AddFunc: sched.addPodToCache, + UpdateFunc: sched.updatePodInCache, + DeleteFunc: sched.deletePodFromCache, + }, + }, + ) + // unscheduled pod queue + podInformer.Informer().AddEventHandler( + cache.FilteringResourceEventHandler{ + FilterFunc: func(obj interface{}) bool { + switch t := obj.(type) { + case *v1.Pod: + return !assignedPod(t) && responsibleForPod(t, schedulerName) + case cache.DeletedFinalStateUnknown: + if pod, ok := t.Obj.(*v1.Pod); ok { + return !assignedPod(pod) && responsibleForPod(pod, schedulerName) + } + utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched)) + return false + default: + utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj)) + return false + } + }, + Handler: cache.ResourceEventHandlerFuncs{ + AddFunc: sched.addPodToSchedulingQueue, + UpdateFunc: sched.updatePodInSchedulingQueue, + DeleteFunc: sched.deletePodFromSchedulingQueue, + }, + }, + ) + + nodeInformer.Informer().AddEventHandler( + cache.ResourceEventHandlerFuncs{ + AddFunc: sched.addNodeToCache, + UpdateFunc: sched.updateNodeInCache, + DeleteFunc: sched.deleteNodeFromCache, + }, + ) + + // On add and delete of PVs, it will affect equivalence cache items + // related to persistent volume + pvInformer.Informer().AddEventHandler( + cache.ResourceEventHandlerFuncs{ + // MaxPDVolumeCountPredicate: since it relies on the counts of PV. + AddFunc: sched.onPvAdd, + UpdateFunc: sched.onPvUpdate, + }, + ) + + // This is for MaxPDVolumeCountPredicate: add/delete PVC will affect counts of PV when it is bound. + pvcInformer.Informer().AddEventHandler( + cache.ResourceEventHandlerFuncs{ + AddFunc: sched.onPvcAdd, + UpdateFunc: sched.onPvcUpdate, + }, + ) + + // This is for ServiceAffinity: affected by the selector of the service is updated. + // Also, if new service is added, equivalence cache will also become invalid since + // existing pods may be "captured" by this service and change this predicate result. 
+ serviceInformer.Informer().AddEventHandler( + cache.ResourceEventHandlerFuncs{ + AddFunc: sched.onServiceAdd, + UpdateFunc: sched.onServiceUpdate, + DeleteFunc: sched.onServiceDelete, + }, + ) + + storageClassInformer.Informer().AddEventHandler( + cache.ResourceEventHandlerFuncs{ + AddFunc: sched.onStorageClassAdd, + }, + ) +} + +func nodeSchedulingPropertiesChanged(newNode *v1.Node, oldNode *v1.Node) bool { + if nodeSpecUnschedulableChanged(newNode, oldNode) { + return true + } + if nodeAllocatableChanged(newNode, oldNode) { + return true + } + if nodeLabelsChanged(newNode, oldNode) { + return true + } + if nodeTaintsChanged(newNode, oldNode) { + return true + } + if nodeConditionsChanged(newNode, oldNode) { + return true + } + + return false +} + +func nodeAllocatableChanged(newNode *v1.Node, oldNode *v1.Node) bool { + return !reflect.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) +} + +func nodeLabelsChanged(newNode *v1.Node, oldNode *v1.Node) bool { + return !reflect.DeepEqual(oldNode.GetLabels(), newNode.GetLabels()) +} + +func nodeTaintsChanged(newNode *v1.Node, oldNode *v1.Node) bool { + return !reflect.DeepEqual(newNode.Spec.Taints, oldNode.Spec.Taints) +} + +func nodeConditionsChanged(newNode *v1.Node, oldNode *v1.Node) bool { + strip := func(conditions []v1.NodeCondition) map[v1.NodeConditionType]v1.ConditionStatus { + conditionStatuses := make(map[v1.NodeConditionType]v1.ConditionStatus, len(conditions)) + for i := range conditions { + conditionStatuses[conditions[i].Type] = conditions[i].Status + } + return conditionStatuses + } + return !reflect.DeepEqual(strip(oldNode.Status.Conditions), strip(newNode.Status.Conditions)) +} + +func nodeSpecUnschedulableChanged(newNode *v1.Node, oldNode *v1.Node) bool { + return newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable && newNode.Spec.Unschedulable == false +} diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/scheduler.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/scheduler.go new file mode 100644 index 000000000000..056805929f81 --- /dev/null +++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/scheduler.go @@ -0,0 +1,576 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "time" + + "k8s.io/klog" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/wait" + appsinformers "k8s.io/client-go/informers/apps/v1" + coreinformers "k8s.io/client-go/informers/core/v1" + policyinformers "k8s.io/client-go/informers/policy/v1beta1" + storageinformers "k8s.io/client-go/informers/storage/v1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" + latestschedulerapi "k8s.io/kubernetes/pkg/scheduler/api/latest" + kubeschedulerconfig "k8s.io/kubernetes/pkg/scheduler/apis/config" + "k8s.io/kubernetes/pkg/scheduler/core" + "k8s.io/kubernetes/pkg/scheduler/factory" + schedulerinternalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" + "k8s.io/kubernetes/pkg/scheduler/metrics" + "k8s.io/kubernetes/pkg/scheduler/util" +) + +const ( + // BindTimeoutSeconds defines the default bind timeout + BindTimeoutSeconds = 100 + // SchedulerError is the reason recorded for events when an error occurs during scheduling a pod. + SchedulerError = "SchedulerError" +) + +// Scheduler watches for new unscheduled pods. It attempts to find +// nodes that they fit on and writes bindings back to the api server. +type Scheduler struct { + config *factory.Config +} + +// Cache returns the cache in scheduler for test to check the data in scheduler. +func (sched *Scheduler) Cache() schedulerinternalcache.Cache { + return sched.config.SchedulerCache +} + +type schedulerOptions struct { + schedulerName string + hardPodAffinitySymmetricWeight int32 + disablePreemption bool + percentageOfNodesToScore int32 + bindTimeoutSeconds int64 +} + +// Option configures a Scheduler +type Option func(*schedulerOptions) + +// WithName sets schedulerName for Scheduler, the default schedulerName is default-scheduler +func WithName(schedulerName string) Option { + return func(o *schedulerOptions) { + o.schedulerName = schedulerName + } +} + +// WithHardPodAffinitySymmetricWeight sets hardPodAffinitySymmetricWeight for Scheduler, the default value is 1 +func WithHardPodAffinitySymmetricWeight(hardPodAffinitySymmetricWeight int32) Option { + return func(o *schedulerOptions) { + o.hardPodAffinitySymmetricWeight = hardPodAffinitySymmetricWeight + } +} + +// WithPreemptionDisabled sets disablePreemption for Scheduler, the default value is false +func WithPreemptionDisabled(disablePreemption bool) Option { + return func(o *schedulerOptions) { + o.disablePreemption = disablePreemption + } +} + +// WithPercentageOfNodesToScore sets percentageOfNodesToScore for Scheduler, the default value is 50 +func WithPercentageOfNodesToScore(percentageOfNodesToScore int32) Option { + return func(o *schedulerOptions) { + o.percentageOfNodesToScore = percentageOfNodesToScore + } +} + +// WithBindTimeoutSeconds sets bindTimeoutSeconds for Scheduler, the default value is 100 +func WithBindTimeoutSeconds(bindTimeoutSeconds int64) Option { + return func(o *schedulerOptions) { + o.bindTimeoutSeconds = bindTimeoutSeconds + } +} + +var defaultSchedulerOptions = schedulerOptions{ + schedulerName: v1.DefaultSchedulerName, + hardPodAffinitySymmetricWeight: v1.DefaultHardPodAffinitySymmetricWeight, + disablePreemption: false, + percentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore, + bindTimeoutSeconds: BindTimeoutSeconds, +} + +// New returns a Scheduler +func New(client clientset.Interface, + 
nodeInformer coreinformers.NodeInformer, + podInformer coreinformers.PodInformer, + pvInformer coreinformers.PersistentVolumeInformer, + pvcInformer coreinformers.PersistentVolumeClaimInformer, + replicationControllerInformer coreinformers.ReplicationControllerInformer, + replicaSetInformer appsinformers.ReplicaSetInformer, + statefulSetInformer appsinformers.StatefulSetInformer, + serviceInformer coreinformers.ServiceInformer, + pdbInformer policyinformers.PodDisruptionBudgetInformer, + storageClassInformer storageinformers.StorageClassInformer, + recorder record.EventRecorder, + schedulerAlgorithmSource kubeschedulerconfig.SchedulerAlgorithmSource, + stopCh <-chan struct{}, + opts ...func(o *schedulerOptions)) (*Scheduler, error) { + + options := defaultSchedulerOptions + for _, opt := range opts { + opt(&options) + } + // Set up the configurator which can create schedulers from configs. + configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{ + SchedulerName: options.schedulerName, + Client: client, + NodeInformer: nodeInformer, + PodInformer: podInformer, + PvInformer: pvInformer, + PvcInformer: pvcInformer, + ReplicationControllerInformer: replicationControllerInformer, + ReplicaSetInformer: replicaSetInformer, + StatefulSetInformer: statefulSetInformer, + ServiceInformer: serviceInformer, + PdbInformer: pdbInformer, + StorageClassInformer: storageClassInformer, + HardPodAffinitySymmetricWeight: options.hardPodAffinitySymmetricWeight, + DisablePreemption: options.disablePreemption, + PercentageOfNodesToScore: options.percentageOfNodesToScore, + BindTimeoutSeconds: options.bindTimeoutSeconds, + }) + var config *factory.Config + source := schedulerAlgorithmSource + switch { + case source.Provider != nil: + // Create the config from a named algorithm provider. + sc, err := configurator.CreateFromProvider(*source.Provider) + if err != nil { + return nil, fmt.Errorf("couldn't create scheduler using provider %q: %v", *source.Provider, err) + } + config = sc + case source.Policy != nil: + // Create the config from a user specified policy source. + policy := &schedulerapi.Policy{} + switch { + case source.Policy.File != nil: + if err := initPolicyFromFile(source.Policy.File.Path, policy); err != nil { + return nil, err + } + case source.Policy.ConfigMap != nil: + if err := initPolicyFromConfigMap(client, source.Policy.ConfigMap, policy); err != nil { + return nil, err + } + } + sc, err := configurator.CreateFromConfig(*policy) + if err != nil { + return nil, fmt.Errorf("couldn't create scheduler from policy: %v", err) + } + config = sc + default: + return nil, fmt.Errorf("unsupported algorithm source: %v", source) + } + // Additional tweaks to the config produced by the configurator. + config.Recorder = recorder + config.DisablePreemption = options.disablePreemption + config.StopEverything = stopCh + + // Create the scheduler. + sched := NewFromConfig(config) + + AddAllEventHandlers(sched, options.schedulerName, nodeInformer, podInformer, pvInformer, pvcInformer, replicationControllerInformer, replicaSetInformer, statefulSetInformer, serviceInformer, pdbInformer, storageClassInformer) + return sched, nil +} + +// initPolicyFromFile initialize policy from file +func initPolicyFromFile(policyFile string, policy *schedulerapi.Policy) error { + // Use a policy serialized in a file. 
+ _, err := os.Stat(policyFile) + if err != nil { + return fmt.Errorf("missing policy config file %s", policyFile) + } + data, err := ioutil.ReadFile(policyFile) + if err != nil { + return fmt.Errorf("couldn't read policy config: %v", err) + } + err = runtime.DecodeInto(latestschedulerapi.Codec, []byte(data), policy) + if err != nil { + return fmt.Errorf("invalid policy: %v", err) + } + return nil +} + +// initPolicyFromConfigMap initialize policy from configMap +func initPolicyFromConfigMap(client clientset.Interface, policyRef *kubeschedulerconfig.SchedulerPolicyConfigMapSource, policy *schedulerapi.Policy) error { + // Use a policy serialized in a config map value. + policyConfigMap, err := client.CoreV1().ConfigMaps(policyRef.Namespace).Get(policyRef.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("couldn't get policy config map %s/%s: %v", policyRef.Namespace, policyRef.Name, err) + } + data, found := policyConfigMap.Data[kubeschedulerconfig.SchedulerPolicyConfigMapKey] + if !found { + return fmt.Errorf("missing policy config map value at key %q", kubeschedulerconfig.SchedulerPolicyConfigMapKey) + } + err = runtime.DecodeInto(latestschedulerapi.Codec, []byte(data), policy) + if err != nil { + return fmt.Errorf("invalid policy: %v", err) + } + return nil +} + +// NewFromConfig returns a new scheduler using the provided Config. +func NewFromConfig(config *factory.Config) *Scheduler { + metrics.Register() + return &Scheduler{ + config: config, + } +} + +// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately. +func (sched *Scheduler) Run() { + if !sched.config.WaitForCacheSync() { + return + } + + go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything) +} + +// Config returns scheduler's config pointer. It is exposed for testing purposes. +func (sched *Scheduler) Config() *factory.Config { + return sched.config +} + +// recordFailedSchedulingEvent records an event for the pod that indicates the +// pod has failed to schedule. +// NOTE: This function modifies "pod". "pod" should be copied before being passed. +func (sched *Scheduler) recordSchedulingFailure(pod *v1.Pod, err error, reason string, message string) { + sched.config.Error(pod, err) + sched.config.Recorder.Event(pod, v1.EventTypeWarning, "FailedScheduling", message) + sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: reason, + Message: err.Error(), + }) +} + +// schedule implements the scheduling algorithm and returns the suggested result(host, +// evaluated nodes number,feasible nodes number). +func (sched *Scheduler) schedule(pod *v1.Pod) (core.ScheduleResult, error) { + result, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister) + if err != nil { + pod = pod.DeepCopy() + sched.recordSchedulingFailure(pod, err, v1.PodReasonUnschedulable, err.Error()) + return core.ScheduleResult{}, err + } + return result, err +} + +// preempt tries to create room for a pod that has failed to schedule, by preempting lower priority pods if possible. +// If it succeeds, it adds the name of the node where preemption has happened to the pod annotations. +// It returns the node name and an error if any. 
+func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) { + preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor) + if err != nil { + klog.Errorf("Error getting the updated preemptor pod object: %v", err) + return "", err + } + + node, victims, nominatedPodsToClear, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr) + if err != nil { + klog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name) + return "", err + } + var nodeName = "" + if node != nil { + nodeName = node.Name + // Update the scheduling queue with the nominated pod information. Without + // this, there would be a race condition between the next scheduling cycle + // and the time the scheduler receives a Pod Update for the nominated pod. + sched.config.SchedulingQueue.UpdateNominatedPodForNode(preemptor, nodeName) + + // Make a call to update nominated node name of the pod on the API server. + err = sched.config.PodPreemptor.SetNominatedNodeName(preemptor, nodeName) + if err != nil { + klog.Errorf("Error in preemption process. Cannot update pod %v/%v annotations: %v", preemptor.Namespace, preemptor.Name, err) + sched.config.SchedulingQueue.DeleteNominatedPodIfExists(preemptor) + return "", err + } + + for _, victim := range victims { + if err := sched.config.PodPreemptor.DeletePod(victim); err != nil { + klog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) + return "", err + } + sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, nodeName) + } + metrics.PreemptionVictims.Set(float64(len(victims))) + } + // Clearing nominated pods should happen outside of "if node != nil". Node could + // be nil when a pod with nominated node name is eligible to preempt again, + // but preemption logic does not find any node for it. In that case Preempt() + // function of generic_scheduler.go returns the pod itself for removal of the annotation. + for _, p := range nominatedPodsToClear { + rErr := sched.config.PodPreemptor.RemoveNominatedNodeName(p) + if rErr != nil { + klog.Errorf("Cannot remove nominated node annotation of pod: %v", rErr) + // We do not return as this error is not critical. + } + } + return nodeName, err +} + +// assumeVolumes will update the volume cache with the chosen bindings +// +// This function modifies assumed if volume binding is required. +func (sched *Scheduler) assumeVolumes(assumed *v1.Pod, host string) (allBound bool, err error) { + allBound, err = sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host) + if err != nil { + sched.recordSchedulingFailure(assumed, err, SchedulerError, + fmt.Sprintf("AssumePodVolumes failed: %v", err)) + } + return +} + +// bindVolumes will make the API update with the assumed bindings and wait until +// the PV controller has completely finished the binding operation. +// +// If binding errors, times out or gets undone, then an error will be returned to +// retry scheduling. 
+func (sched *Scheduler) bindVolumes(assumed *v1.Pod) error { + klog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name) + err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed) + if err != nil { + klog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\": %v", assumed.Namespace, assumed.Name, err) + + // Unassume the Pod and retry scheduling + if forgetErr := sched.config.SchedulerCache.ForgetPod(assumed); forgetErr != nil { + klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr) + } + + sched.recordSchedulingFailure(assumed, err, "VolumeBindingFailed", err.Error()) + return err + } + + klog.V(5).Infof("Success binding volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name) + return nil +} + +// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous. +// assume modifies `assumed`. +func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { + // Optimistically assume that the binding will succeed and send it to apiserver + // in the background. + // If the binding fails, scheduler will release resources allocated to assumed pod + // immediately. + assumed.Spec.NodeName = host + + if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil { + klog.Errorf("scheduler cache AssumePod failed: %v", err) + + // This is most probably result of a BUG in retrying logic. + // We report an error here so that pod scheduling can be retried. + // This relies on the fact that Error will check if the pod has been bound + // to a node and if so will not add it back to the unscheduled pods queue + // (otherwise this would cause an infinite loop). + sched.recordSchedulingFailure(assumed, err, SchedulerError, + fmt.Sprintf("AssumePod failed: %v", err)) + return err + } + // if "assumed" is a nominated pod, we should remove it from internal cache + if sched.config.SchedulingQueue != nil { + sched.config.SchedulingQueue.DeleteNominatedPodIfExists(assumed) + } + + return nil +} + +// bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we +// handle binding metrics internally. +func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error { + bindingStart := time.Now() + // If binding succeeded then PodScheduled condition will be updated in apiserver so that + // it's atomic with setting host. 
+ err := sched.config.GetBinder(assumed).Bind(b) + if finErr := sched.config.SchedulerCache.FinishBinding(assumed); finErr != nil { + klog.Errorf("scheduler cache FinishBinding failed: %v", finErr) + } + if err != nil { + klog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name) + if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil { + klog.Errorf("scheduler cache ForgetPod failed: %v", err) + } + sched.recordSchedulingFailure(assumed, err, SchedulerError, + fmt.Sprintf("Binding rejected: %v", err)) + return err + } + + metrics.BindingLatency.Observe(metrics.SinceInSeconds(bindingStart)) + metrics.DeprecatedBindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart)) + metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart)) + metrics.DeprecatedSchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart)) + sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name) + return nil +} + +// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting. +func (sched *Scheduler) scheduleOne() { + plugins := sched.config.PluginSet + // Remove all plugin context data at the beginning of a scheduling cycle. + if plugins.Data().Ctx != nil { + plugins.Data().Ctx.Reset() + } + + pod := sched.config.NextPod() + // pod could be nil when schedulerQueue is closed + if pod == nil { + return + } + if pod.DeletionTimestamp != nil { + sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) + klog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) + return + } + + klog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name) + + // Synchronously attempt to find a fit for the pod. + start := time.Now() + scheduleResult, err := sched.schedule(pod) + if err != nil { + // schedule() may have failed because the pod would not fit on any host, so we try to + // preempt, with the expectation that the next time the pod is tried for scheduling it + // will fit due to the preemption. It is also possible that a different pod will schedule + // into the resources that were preempted, but this is harmless. + if fitError, ok := err.(*core.FitError); ok { + if !util.PodPriorityEnabled() || sched.config.DisablePreemption { + klog.V(3).Infof("Pod priority feature is not enabled or preemption is disabled by scheduler configuration." + + " No preemption is performed.") + } else { + preemptionStartTime := time.Now() + sched.preempt(pod, fitError) + metrics.PreemptionAttempts.Inc() + metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInSeconds(preemptionStartTime)) + metrics.DeprecatedSchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime)) + metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime)) + metrics.DeprecatedSchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime)) + } + // Pod did not fit anywhere, so it is counted as a failure. If preemption + // succeeds, the pod should get counted as a success the next time we try to + // schedule it. 
(hopefully) + metrics.PodScheduleFailures.Inc() + } else { + klog.Errorf("error selecting node for pod: %v", err) + metrics.PodScheduleErrors.Inc() + } + return + } + metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedSchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start)) + // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. + // This allows us to keep scheduling without waiting on binding to occur. + assumedPod := pod.DeepCopy() + + // Assume volumes first before assuming the pod. + // + // If all volumes are completely bound, then allBound is true and binding will be skipped. + // + // Otherwise, binding of volumes is started after the pod is assumed, but before pod binding. + // + // This function modifies 'assumedPod' if volume binding is required. + allBound, err := sched.assumeVolumes(assumedPod, scheduleResult.SuggestedHost) + if err != nil { + klog.Errorf("error assuming volumes: %v", err) + metrics.PodScheduleErrors.Inc() + return + } + + // Run "reserve" plugins. + for _, pl := range plugins.ReservePlugins() { + if err := pl.Reserve(plugins, assumedPod, scheduleResult.SuggestedHost); err != nil { + klog.Errorf("error while running %v reserve plugin for pod %v: %v", pl.Name(), assumedPod.Name, err) + sched.recordSchedulingFailure(assumedPod, err, SchedulerError, + fmt.Sprintf("reserve plugin %v failed", pl.Name())) + metrics.PodScheduleErrors.Inc() + return + } + } + // assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost + err = sched.assume(assumedPod, scheduleResult.SuggestedHost) + if err != nil { + klog.Errorf("error assuming pod: %v", err) + metrics.PodScheduleErrors.Inc() + return + } + // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). + go func() { + // Bind volumes first before Pod + if !allBound { + err := sched.bindVolumes(assumedPod) + if err != nil { + klog.Errorf("error binding volumes: %v", err) + metrics.PodScheduleErrors.Inc() + return + } + } + + // Run "prebind" plugins. 
+ for _, pl := range plugins.PrebindPlugins() { + approved, err := pl.Prebind(plugins, assumedPod, scheduleResult.SuggestedHost) + if err != nil { + approved = false + klog.Errorf("error while running %v prebind plugin for pod %v: %v", pl.Name(), assumedPod.Name, err) + metrics.PodScheduleErrors.Inc() + } + if !approved { + sched.Cache().ForgetPod(assumedPod) + var reason string + if err == nil { + msg := fmt.Sprintf("prebind plugin %v rejected pod %v.", pl.Name(), assumedPod.Name) + klog.V(4).Infof(msg) + err = errors.New(msg) + reason = v1.PodReasonUnschedulable + } else { + reason = SchedulerError + } + sched.recordSchedulingFailure(assumedPod, err, reason, err.Error()) + return + } + } + + err := sched.bind(assumedPod, &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID}, + Target: v1.ObjectReference{ + Kind: "Node", + Name: scheduleResult.SuggestedHost, + }, + }) + metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedE2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start)) + if err != nil { + klog.Errorf("error binding pod: %v", err) + metrics.PodScheduleErrors.Inc() + } else { + klog.V(2).Infof("pod %v/%v is bound successfully on node %v, %d nodes evaluated, %d nodes were found feasible", assumedPod.Namespace, assumedPod.Name, scheduleResult.SuggestedHost, scheduleResult.EvaluatedNodes, scheduleResult.FeasibleNodes) + metrics.PodScheduleSuccesses.Inc() + } + }() +}
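A note on the node update handler in eventhandlers.go above: the unschedulable queue is only flushed when nodeSchedulingPropertiesChanged sees a change that could help pending pods (allocatable, labels, taints, conditions, or the unschedulable flag being cleared). The sketch below restates just the unschedulable-flag rule with a local helper, since the vendored helpers are unexported; the helper and node names are illustrative, not part of the vendored code.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// becameSchedulable restates nodeSpecUnschedulableChanged: only the transition
// from unschedulable to schedulable counts as a relevant change.
func becameSchedulable(newNode, oldNode *v1.Node) bool {
	return newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable && !newNode.Spec.Unschedulable
}

func main() {
	cordoned := &v1.Node{Spec: v1.NodeSpec{Unschedulable: true}}
	uncordoned := &v1.Node{Spec: v1.NodeSpec{Unschedulable: false}}

	// Uncordoning (true -> false) can make pending pods schedulable, so the
	// scheduler moves them back to the active queue.
	fmt.Println(becameSchedulable(uncordoned, cordoned)) // true
	// Cordoning (false -> true) cannot help pending pods; this check alone
	// does not trigger a flush.
	fmt.Println(becameSchedulable(cordoned, uncordoned)) // false
}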
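Similarly, skipPodUpdate above ignores updates to a pod the scheduler has already assumed when only ResourceVersion, Spec.NodeName and/or Annotations differ. A minimal sketch of that comparison follows, leaving out the cache lookup the real method performs; the pod name and annotation key are made up for illustration.

package main

import (
	"fmt"
	"reflect"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// stripIgnoredFields mirrors the closure in skipPodUpdate: fields that are
// expected to differ between an assumed pod and later API updates are cleared
// before the deep comparison.
func stripIgnoredFields(pod *v1.Pod) *v1.Pod {
	p := pod.DeepCopy()
	p.ResourceVersion = ""
	p.Spec.NodeName = ""
	p.Annotations = nil
	return p
}

func main() {
	assumed := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "web-0", ResourceVersion: "100"},
		Spec:       v1.PodSpec{NodeName: "node-a"},
	}

	// This update only bumps ResourceVersion and adds an annotation.
	update := assumed.DeepCopy()
	update.ResourceVersion = "101"
	update.Annotations = map[string]string{"example.com/owner": "autoscaler"}

	// Equal after stripping, so skipPodUpdate would return true for an assumed pod.
	fmt.Println(reflect.DeepEqual(stripIgnoredFields(assumed), stripIgnoredFields(update))) // true
}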
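Finally, initPolicyFromFile and initPolicyFromConfigMap in scheduler.go both end in the same runtime.DecodeInto call against latestschedulerapi.Codec; the ConfigMap variant just reads the payload from the key named by kubeschedulerconfig.SchedulerPolicyConfigMapKey. The sketch below decodes a small inline policy the same way; the predicate and priority names are ordinary defaults chosen for illustration, not a recommended configuration.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
	latestschedulerapi "k8s.io/kubernetes/pkg/scheduler/api/latest"
)

// policyJSON is the kind of payload the policy file (or config map value) carries.
const policyJSON = `{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    {"name": "PodFitsResources"},
    {"name": "PodFitsHostPorts"}
  ],
  "priorities": [
    {"name": "LeastRequestedPriority", "weight": 1}
  ]
}`

func main() {
	policy := &schedulerapi.Policy{}
	// Same decode call used by both init helpers in scheduler.go.
	if err := runtime.DecodeInto(latestschedulerapi.Codec, []byte(policyJSON), policy); err != nil {
		fmt.Printf("invalid policy: %v\n", err)
		return
	}
	fmt.Printf("decoded %d predicates and %d priorities\n", len(policy.Predicates), len(policy.Priorities))
}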