-
Notifications
You must be signed in to change notification settings - Fork 4k
/
cloud_provider.go
360 lines (305 loc) · 15.3 KB
/
cloud_provider.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cloudprovider
import (
"fmt"
"time"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
)
// Names of the supported cloud providers, listed alphabetically by identifier.
const (
	// AlicloudProviderName is the provider name of alicloud.
	AlicloudProviderName = "alicloud"
	// AwsProviderName is the provider name of aws.
	AwsProviderName = "aws"
	// AzureProviderName is the provider name of azure.
	AzureProviderName = "azure"
	// BaiducloudProviderName is the provider name of baiducloud.
	BaiducloudProviderName = "baiducloud"
	// BizflyCloudProviderName is the provider name of bizflycloud.
	BizflyCloudProviderName = "bizflycloud"
	// BrightboxProviderName is the provider name of brightbox.
	BrightboxProviderName = "brightbox"
	// CherryServersProviderName is the provider name of cherry servers.
	CherryServersProviderName = "cherryservers"
	// CivoProviderName is the provider name of civo.
	CivoProviderName = "civo"
	// CloudStackProviderName is the provider name of cloudstack.
	CloudStackProviderName = "cloudstack"
	// ClusterAPIProviderName is the provider name of clusterapi.
	ClusterAPIProviderName = "clusterapi"
	// DigitalOceanProviderName is the provider name of digitalocean.
	DigitalOceanProviderName = "digitalocean"
	// EquinixMetalProviderName is the provider name of equinixmetal.
	EquinixMetalProviderName = "equinixmetal"
	// ExoscaleProviderName is the provider name of exoscale.
	ExoscaleProviderName = "exoscale"
	// ExternalGrpcProviderName is the provider name of the external grpc provider.
	ExternalGrpcProviderName = "externalgrpc"
	// GceProviderName is the provider name of gce.
	GceProviderName = "gce"
	// HetznerProviderName is the provider name of hetzner.
	HetznerProviderName = "hetzner"
	// HuaweicloudProviderName is the provider name of huaweicloud.
	HuaweicloudProviderName = "huaweicloud"
	// IonoscloudProviderName is the provider name of ionoscloud.
	IonoscloudProviderName = "ionoscloud"
	// KamateraProviderName is the provider name of kamatera.
	KamateraProviderName = "kamatera"
	// KubemarkProviderName is the provider name of kubemark.
	KubemarkProviderName = "kubemark"
	// KwokProviderName is the provider name of kwok.
	KwokProviderName = "kwok"
	// LinodeProviderName is the provider name of linode.
	LinodeProviderName = "linode"
	// MagnumProviderName is the provider name of magnum.
	MagnumProviderName = "magnum"
	// OracleCloudProviderName is the provider name of oci.
	OracleCloudProviderName = "oci"
	// OVHcloudProviderName is the provider name of ovhcloud.
	OVHcloudProviderName = "ovhcloud"
	// PacketProviderName is the provider name of packet.
	PacketProviderName = "packet"
	// RancherProviderName is the provider name of rancher.
	RancherProviderName = "rancher"
	// ScalewayProviderName is the provider name of scaleway.
	ScalewayProviderName = "scaleway"
	// TencentcloudProviderName is the provider name of tencentcloud.
	TencentcloudProviderName = "tencentcloud"
	// VolcengineProviderName is the provider name of volcengine.
	VolcengineProviderName = "volcengine"
	// VultrProviderName is the provider name of vultr.
	VultrProviderName = "vultr"
)
// GpuConfig contains the label, type and the resource name for a GPU.
// It is the value returned (by pointer) from CloudProvider.GetNodeGpuConfig.
type GpuConfig struct {
// Label is the label associated with the GPU.
// NOTE(review): presumably the node label key reported by CloudProvider.GPULabel — confirm.
Label string
// Type is the GPU type.
// NOTE(review): presumably one of the keys of CloudProvider.GetAvailableGPUTypes — confirm.
Type string
// ResourceName is the Kubernetes resource name of the GPU.
ResourceName apiv1.ResourceName
}
// CloudProvider contains configuration info and functions for interacting with
// a cloud provider (GCE, AWS, etc.).
type CloudProvider interface {
// Name returns the name of the cloud provider.
Name() string
// NodeGroups returns all node groups configured for this cloud provider.
NodeGroups() []NodeGroup
// NodeGroupForNode returns the node group for the given node. It returns a nil
// node group if the node should not be processed by cluster autoscaler, and a
// non-nil error if one occurred. Must be implemented.
NodeGroupForNode(*apiv1.Node) (NodeGroup, error)
// HasInstance reports whether the node has a corresponding instance in the cloud provider:
// true if the node has an instance, false if it no longer exists.
HasInstance(*apiv1.Node) (bool, error)
// Pricing returns the pricing model for this cloud provider, or an error if it is not available.
// Implementation optional.
Pricing() (PricingModel, errors.AutoscalerError)
// GetAvailableMachineTypes gets all machine types that can be requested from the cloud provider.
// Implementation optional.
GetAvailableMachineTypes() ([]string, error)
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not
// automatically created on the cloud provider side, and it is not returned by NodeGroups() until it is created.
// Implementation optional.
NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
taints []apiv1.Taint, extraResources map[string]resource.Quantity) (NodeGroup, error)
// GetResourceLimiter returns a struct containing limits (max, min) for resources (cores, memory etc.).
GetResourceLimiter() (*ResourceLimiter, error)
// GPULabel returns the label added to nodes with a GPU resource.
GPULabel() string
// GetAvailableGPUTypes returns all available GPU types that the cloud provider supports,
// as a set keyed by GPU type.
GetAvailableGPUTypes() map[string]struct{}
// GetNodeGpuConfig returns the label, type and resource name for the GPU added to the node.
// If the node doesn't have any GPUs, it returns nil.
GetNodeGpuConfig(*apiv1.Node) *GpuConfig
// Cleanup cleans up open resources (e.g. goroutines) before the cloud provider is destroyed.
Cleanup() error
// Refresh is called before every main loop and can be used to dynamically update cloud provider state.
// In particular, the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh().
Refresh() error
}
// Sentinel errors shared by cloud provider implementations. All of them are
// created as errors.InternalError autoscaler errors.
var (
	// ErrNotImplemented is returned if a method is not implemented.
	ErrNotImplemented = errors.NewAutoscalerError(errors.InternalError, "Not implemented")

	// ErrAlreadyExist is returned when the thing being operated on already exists.
	// NOTE(review): presumably used for node groups that already exist on the
	// cloud provider side (e.g. from NodeGroup.Create) — confirm against callers.
	ErrAlreadyExist = errors.NewAutoscalerError(errors.InternalError, "Already exist")

	// ErrIllegalConfiguration is returned when trying to create a node group via
	// NewNodeGroup with a configuration that is not supported by the cloud provider.
	ErrIllegalConfiguration = errors.NewAutoscalerError(errors.InternalError, "Configuration not allowed by cloud provider")
)
// NodeGroup contains configuration info and functions to control a set
// of nodes that have the same capacity and set of labels.
type NodeGroup interface {
// MaxSize returns the maximum size of the node group.
MaxSize() int
// MinSize returns the minimum size of the node group.
MinSize() int
// TargetSize returns the current target size of the node group. It is possible that the
// number of nodes in Kubernetes is different at the moment, but it should become equal
// to TargetSize() once everything stabilizes (new nodes finish startup and registration or
// removed nodes are deleted completely). Implementation required.
TargetSize() (int, error)
// IncreaseSize increases the size of the node group by delta. It cannot be used to
// remove nodes: to delete a node you need to explicitly name it and use DeleteNodes.
// This function should wait until the node group size is updated. Implementation required.
IncreaseSize(delta int) error
// AtomicIncreaseSize tries to increase the size of the node group atomically.
// It returns an error if requesting the entire delta fails. The method doesn't wait until the new instances appear.
// Implementation is optional. Implementing this method generally requires external cloud provider support
// for atomically requesting multiple instances. If implemented, CA will take advantage of the method while scaling up
// the BestEffortAtomicScaleUp ProvisioningClass, guaranteeing that all instances required for such a
// ProvisioningRequest are provisioned atomically.
AtomicIncreaseSize(delta int) error
// DeleteNodes deletes nodes from this node group. An error is returned either on
// failure or if a given node doesn't belong to this node group. This function
// should wait until the node group size is updated. Implementation required.
DeleteNodes([]*apiv1.Node) error
// ForceDeleteNodes deletes nodes from this node group without checking
// constraints such as minimal size validation. An error is returned either on
// failure or if a given node doesn't belong to this node group. This function
// should wait until the node group size is updated.
ForceDeleteNodes([]*apiv1.Node) error
// DecreaseTargetSize decreases the target size of the node group. This function
// must not delete any existing node and can be used only to reduce the
// request for new nodes that have not yet been fulfilled. Delta should be negative.
// It is assumed that the cloud provider will not delete existing nodes when there
// is an option to just decrease the target. Implementation required.
DecreaseTargetSize(delta int) error
// Id returns a unique identifier of the node group.
Id() string
// Debug returns a string containing all information regarding this node group.
Debug() string
// Nodes returns a list of all nodes that belong to this node group.
// Instance objects returned by this method are required to have the Id field set;
// other fields are optional.
// The list should also include instances that might not have become a Kubernetes node yet.
Nodes() ([]Instance, error)
// TemplateNodeInfo returns a framework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what a new node would look like if the node group was expanded. The returned
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
// capacity and allocatable information as well as all pods that are started on
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
TemplateNodeInfo() (*framework.NodeInfo, error)
// Exist checks if the node group really exists on the cloud provider side. It allows telling a
// theoretical node group from a real one. Implementation required.
Exist() bool
// Create creates the node group on the cloud provider side. Implementation optional.
Create() (NodeGroup, error)
// Delete deletes the node group on the cloud provider side.
// This will be executed only for autoprovisioned node groups, once their size drops to 0.
// Implementation optional.
Delete() error
// Autoprovisioned returns true if the node group is autoprovisioned. An autoprovisioned group
// was created by CA and can be deleted when scaled to 0.
Autoprovisioned() bool
// GetOptions returns the NodeGroupAutoscalingOptions that should be used for this particular
// NodeGroup. Returning nil will result in using the default options.
// Implementation optional. Callers MUST handle `cloudprovider.ErrNotImplemented`.
GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error)
}
// Instance represents a cloud-provider node. The node does not necessarily map to a k8s node,
// i.e. it does not have to be registered in the k8s cluster despite being returned by the
// NodeGroup.Nodes() method. It is also valid to have an Instance object for nodes which are
// being created or deleted.
type Instance struct {
// Id is the instance id. NodeGroup.Nodes() requires this field to be set.
Id string
// Status represents the status of the node. Optional; may be nil.
Status *InstanceStatus
}
// InstanceStatus represents the status of an instance: its lifecycle state and,
// optionally, details of an error condition affecting it.
type InstanceStatus struct {
// State tells if the instance is running, being created or being deleted.
State InstanceState
// ErrorInfo is non-nil if there is an error condition related to the instance,
// e.g. the instance cannot be created.
ErrorInfo *InstanceErrorInfo
}
// InstanceState describes the lifecycle phase of an instance: running, being
// created or being deleted.
type InstanceState int

// Known instance states. The numeric values are spelled out explicitly; note
// that the zero value of InstanceState matches none of them.
const (
	// InstanceRunning means the instance is running.
	InstanceRunning = InstanceState(1)
	// InstanceCreating means the instance is being created.
	InstanceCreating = InstanceState(2)
	// InstanceDeleting means the instance is being deleted.
	InstanceDeleting = InstanceState(3)
)
// InstanceErrorInfo provides information about an error condition on an instance.
// It is referenced from InstanceStatus.ErrorInfo.
type InstanceErrorInfo struct {
// ErrorClass tells what the class of the error on the instance is.
ErrorClass InstanceErrorClass
// ErrorCode is a cloud-provider specific error code for the error condition.
ErrorCode string
// ErrorMessage is a human-readable description of the error condition.
ErrorMessage string
}
// InstanceErrorClass defines the class of an error condition on an instance.
type InstanceErrorClass int

const (
	// OutOfResourcesErrorClass means that the error is related to a lack of resources
	// (e.g. due to a stockout or quota-exceeded situation).
	OutOfResourcesErrorClass = InstanceErrorClass(1)
	// OtherErrorClass means some non-specific error situation occurred.
	OtherErrorClass = InstanceErrorClass(99)
)

// String returns the human-readable name of the error class: "OutOfResource"
// for OutOfResourcesErrorClass and "Other" for OtherErrorClass. Any other
// value is rendered as its decimal integer value.
func (c InstanceErrorClass) String() string {
	if c == OutOfResourcesErrorClass {
		return "OutOfResource"
	}
	if c == OtherErrorClass {
		return "Other"
	}
	// Unknown class: fall back to the raw number. %d formats the underlying
	// integer and does not call String again.
	return fmt.Sprintf("%d", c)
}
const (
// FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created.
// Note that these placeholders do not map to real nodes in k8s and are merely used for error handling.
FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason"
// FakeNodeUnregistered represents a node that is identified by CA as unregistered.
// NOTE(review): presumably used as a value of FakeNodeReasonAnnotation — confirm against callers.
FakeNodeUnregistered = "unregistered"
// FakeNodeCreateError represents a node that is identified by CA as a created node with errors.
// NOTE(review): presumably used as a value of FakeNodeReasonAnnotation — confirm against callers.
FakeNodeCreateError = "create-error"
)
// PricingModel contains information about the node price and how it changes in time.
// It is the type returned by CloudProvider.Pricing.
type PricingModel interface {
// NodePrice returns the price of running the given node for the period of time
// between startTime and endTime.
// All prices returned by the structure should be in the same currency.
NodePrice(node *apiv1.Node, startTime time.Time, endTime time.Time) (float64, error)
// PodPrice returns a theoretical minimum price of running the given pod for the
// period of time between startTime and endTime on a perfectly matching machine.
PodPrice(pod *apiv1.Pod, startTime time.Time, endTime time.Time) (float64, error)
}
const (
	// ResourceNameCores is the string name for cores. It's used by ResourceLimiter.
	ResourceNameCores = "cpu"
	// ResourceNameMemory is the string name for memory. It's used by ResourceLimiter.
	// Memory should always be provided in bytes.
	ResourceNameMemory = "memory"
)

// IsCustomResource reports whether the given resource name denotes a custom
// (GPU) resource, i.e. anything other than ResourceNameCores or ResourceNameMemory.
func IsCustomResource(resourceName string) bool {
	// Hack: the user only gives us a map of resource name -> limits, so every
	// name that is not cpu/memory is assumed to be a GPU.
	switch resourceName {
	case ResourceNameCores, ResourceNameMemory:
		return false
	default:
		return true
	}
}
// ContainsCustomResources returns true iff given list contains any custom resource name
func ContainsCustomResources(resources []string) bool {
for _, resource := range resources {
if IsCustomResource(resource) {
return true
}
}
return false
}