Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revendor CA against tip of k8s.io/kubernetes #2515

Merged
merged 9 commits into from
Nov 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
9 changes: 5 additions & 4 deletions cluster-autoscaler/cloudprovider/gce/gce_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ limitations under the License.
package gce

import (
"github.com/prometheus/client_golang/prometheus"
k8smetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)

const (
Expand All @@ -26,8 +27,8 @@ const (

var (
/**** Metrics related to GCE API usage ****/
requestCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
requestCounter = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "gce_request_count",
Help: "Counter of GCE API requests for each verb and API resource.",
Expand All @@ -37,7 +38,7 @@ var (

// RegisterMetrics registers all GCE metrics.
func RegisterMetrics() {
prometheus.MustRegister(requestCounter)
legacyregistry.MustRegister(requestCounter)
}

// registerRequest registers request to GCE API.
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/gce/templates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ func makeResourceList2(cpu int64, memory int64, gpu int64, pods int64) (apiv1.Re

func assertEqualResourceLists(t *testing.T, name string, expected, actual apiv1.ResourceList) {
t.Helper()
assert.True(t, quota.V1Equals(expected, actual),
assert.True(t, quota.Equals(expected, actual),
"%q unequal:\nExpected: %v\nActual: %v", name, stringifyResourceList(expected), stringifyResourceList(actual))
}

Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/filter_out_schedulable.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func filterOutSchedulableByPacking(unschedulableCandidates []*apiv1.Pod, nodes [
nodeNameToNodeInfo := schedulerutil.CreateNodeNameToInfoMap(nonExpendableScheduled, nodes)

sort.Slice(unschedulableCandidates, func(i, j int) bool {
return util.GetPodPriority(unschedulableCandidates[i]) > util.GetPodPriority(unschedulableCandidates[j])
return util.MoreImportantPod(unschedulableCandidates[i], unschedulableCandidates[j])
})

for _, pod := range unschedulableCandidates {
Expand Down
19 changes: 10 additions & 9 deletions cluster-autoscaler/core/utils/taint_key_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package utils

import (
v1 "k8s.io/api/core/v1"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)

Expand All @@ -26,14 +27,14 @@ type TaintKeySet map[string]bool
var (
// NodeConditionTaints lists taint keys used as node conditions
NodeConditionTaints = TaintKeySet{
schedulerapi.TaintNodeNotReady: true,
schedulerapi.TaintNodeUnreachable: true,
schedulerapi.TaintNodeUnschedulable: true,
schedulerapi.TaintNodeMemoryPressure: true,
schedulerapi.TaintNodeDiskPressure: true,
schedulerapi.TaintNodeNetworkUnavailable: true,
schedulerapi.TaintNodePIDPressure: true,
schedulerapi.TaintExternalCloudProvider: true,
schedulerapi.TaintNodeShutdown: true,
v1.TaintNodeNotReady: true,
v1.TaintNodeUnreachable: true,
v1.TaintNodeUnschedulable: true,
v1.TaintNodeMemoryPressure: true,
v1.TaintNodeDiskPressure: true,
v1.TaintNodeNetworkUnavailable: true,
v1.TaintNodePIDPressure: true,
schedulerapi.TaintExternalCloudProvider: true,
schedulerapi.TaintNodeShutdown: true,
}
)
314 changes: 200 additions & 114 deletions cluster-autoscaler/go.mod

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion cluster-autoscaler/go.mod-extra
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ go 1.12

require (
github.com/rancher/go-rancher v0.1.0
github.com/google/go-querystring v1.0.0
github.com/aws/aws-sdk-go v1.23.18
)

Expand Down
375 changes: 230 additions & 145 deletions cluster-autoscaler/go.sum

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"syscall"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/spf13/pflag"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -53,6 +52,7 @@ import (
"k8s.io/client-go/tools/leaderelection/resourcelock"
kube_flag "k8s.io/component-base/cli/flag"
componentbaseconfig "k8s.io/component-base/config"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/client/leaderelectionconfig"
)
Expand Down Expand Up @@ -368,7 +368,7 @@ func main() {
klog.V(1).Infof("Cluster Autoscaler %s", version.ClusterAutoscalerVersion)

go func() {
http.Handle("/metrics", prometheus.Handler())
http.Handle("/metrics", legacyregistry.Handler())
http.Handle("/health-check", healthCheck)
err := http.ListenAndServe(*address, nil)
klog.Fatalf("Failed to start metrics: %v", err)
Expand Down
119 changes: 60 additions & 59 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ import (

"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
_ "k8s.io/kubernetes/pkg/client/metrics/prometheus" // for client-go metrics registration
_ "k8s.io/component-base/metrics/prometheus/restclient" // for client-go metrics registration

"github.com/prometheus/client_golang/prometheus"
k8smetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog"
)

Expand Down Expand Up @@ -92,58 +93,58 @@ const (

var (
/**** Metrics related to cluster state ****/
clusterSafeToAutoscale = prometheus.NewGauge(
prometheus.GaugeOpts{
clusterSafeToAutoscale = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_safe_to_autoscale",
Help: "Whether or not cluster is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
},
)

nodesCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
nodesCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "nodes_count",
Help: "Number of nodes in cluster.",
}, []string{"state"},
)

nodeGroupsCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
nodeGroupsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_groups_count",
Help: "Number of node groups managed by CA.",
}, []string{"node_group_type"},
)

unschedulablePodsCount = prometheus.NewGauge(
prometheus.GaugeOpts{
unschedulablePodsCount = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "unschedulable_pods_count",
Help: "Number of unschedulable pods in the cluster.",
},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "last_activity",
Help: "Last time certain part of CA logic executed.",
}, []string{"activity"},
)

functionDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
functionDuration = k8smetrics.NewHistogramVec(
&k8smetrics.HistogramOpts{
Namespace: caNamespace,
Name: "function_duration_seconds",
Help: "Time taken by various parts of CA main loop.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
}, []string{"function"},
)

functionDurationSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
functionDurationSummary = k8smetrics.NewSummaryVec(
&k8smetrics.SummaryOpts{
Namespace: caNamespace,
Name: "function_duration_quantile_seconds",
Help: "Quantiles of time taken by various parts of CA main loop.",
Expand All @@ -152,97 +153,97 @@ var (
)

/**** Metrics related to autoscaler operations ****/
errorsCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
errorsCount = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "errors_total",
Help: "The number of CA loops failed due to an error.",
}, []string{"type"},
)

scaleUpCount = prometheus.NewCounter(
prometheus.CounterOpts{
scaleUpCount = k8smetrics.NewCounter(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "scaled_up_nodes_total",
Help: "Number of nodes added by CA.",
},
)

gpuScaleUpCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
gpuScaleUpCount = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "scaled_up_gpu_nodes_total",
Help: "Number of GPU nodes added by CA, by GPU name.",
}, []string{"gpu_name"},
)

failedScaleUpCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
failedScaleUpCount = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "failed_scale_ups_total",
Help: "Number of times scale-up operation has failed.",
}, []string{"reason"},
)

scaleDownCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
scaleDownCount = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "scaled_down_nodes_total",
Help: "Number of nodes removed by CA.",
}, []string{"reason"},
)

gpuScaleDownCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
gpuScaleDownCount = k8smetrics.NewCounterVec(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "scaled_down_gpu_nodes_total",
Help: "Number of GPU nodes removed by CA, by reason and GPU name.",
}, []string{"reason", "gpu_name"},
)

evictionsCount = prometheus.NewCounter(
prometheus.CounterOpts{
evictionsCount = k8smetrics.NewCounter(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "evicted_pods_total",
Help: "Number of pods evicted by CA",
},
)

unneededNodesCount = prometheus.NewGauge(
prometheus.GaugeOpts{
unneededNodesCount = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "unneeded_nodes_count",
Help: "Number of nodes currently considered unneeded by CA.",
},
)

scaleDownInCooldown = prometheus.NewGauge(
prometheus.GaugeOpts{
scaleDownInCooldown = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "scale_down_in_cooldown",
Help: "Whether or not the scale down is in cooldown. 1 if it is, 0 otherwise.",
},
)

/**** Metrics related to NodeAutoprovisioning ****/
napEnabled = prometheus.NewGauge(
prometheus.GaugeOpts{
napEnabled = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "nap_enabled",
Help: "Whether or not Node Autoprovisioning is enabled. 1 if it is, 0 otherwise.",
},
)

nodeGroupCreationCount = prometheus.NewCounter(
prometheus.CounterOpts{
nodeGroupCreationCount = k8smetrics.NewCounter(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "created_node_groups_total",
Help: "Number of node groups created by Node Autoprovisioning.",
},
)

nodeGroupDeletionCount = prometheus.NewCounter(
prometheus.CounterOpts{
nodeGroupDeletionCount = k8smetrics.NewCounter(
&k8smetrics.CounterOpts{
Namespace: caNamespace,
Name: "deleted_node_groups_total",
Help: "Number of node groups deleted by Node Autoprovisioning.",
Expand All @@ -252,25 +253,25 @@ var (

// RegisterAll registers all metrics.
func RegisterAll() {
prometheus.MustRegister(clusterSafeToAutoscale)
prometheus.MustRegister(nodesCount)
prometheus.MustRegister(nodeGroupsCount)
prometheus.MustRegister(unschedulablePodsCount)
prometheus.MustRegister(lastActivity)
prometheus.MustRegister(functionDuration)
prometheus.MustRegister(functionDurationSummary)
prometheus.MustRegister(errorsCount)
prometheus.MustRegister(scaleUpCount)
prometheus.MustRegister(gpuScaleUpCount)
prometheus.MustRegister(failedScaleUpCount)
prometheus.MustRegister(scaleDownCount)
prometheus.MustRegister(gpuScaleDownCount)
prometheus.MustRegister(evictionsCount)
prometheus.MustRegister(unneededNodesCount)
prometheus.MustRegister(scaleDownInCooldown)
prometheus.MustRegister(napEnabled)
prometheus.MustRegister(nodeGroupCreationCount)
prometheus.MustRegister(nodeGroupDeletionCount)
legacyregistry.MustRegister(clusterSafeToAutoscale)
legacyregistry.MustRegister(nodesCount)
legacyregistry.MustRegister(nodeGroupsCount)
legacyregistry.MustRegister(unschedulablePodsCount)
legacyregistry.MustRegister(lastActivity)
legacyregistry.MustRegister(functionDuration)
legacyregistry.MustRegister(functionDurationSummary)
legacyregistry.MustRegister(errorsCount)
legacyregistry.MustRegister(scaleUpCount)
legacyregistry.MustRegister(gpuScaleUpCount)
legacyregistry.MustRegister(failedScaleUpCount)
legacyregistry.MustRegister(scaleDownCount)
legacyregistry.MustRegister(gpuScaleDownCount)
legacyregistry.MustRegister(evictionsCount)
legacyregistry.MustRegister(unneededNodesCount)
legacyregistry.MustRegister(scaleDownInCooldown)
legacyregistry.MustRegister(napEnabled)
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)
}

// UpdateDurationFromStart records the duration of the step identified by the
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/simulator/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ func findPlaceFor(removedNode string, pods []*apiv1.Pod, nodes []*apiv1.Node, no

loggingQuota := glogx.PodsLoggingQuota()

tryNodeForPod := func(nodename string, pod *apiv1.Pod, predicateMeta predicates.PredicateMetadata) bool {
tryNodeForPod := func(nodename string, pod *apiv1.Pod, predicateMeta predicates.Metadata) bool {
nodeInfo, found := newNodeInfos[nodename]
if found {
if nodeInfo.Node() == nil {
Expand Down
Loading