Skip to content

Commit

Permalink
Wait for re-attachment of PVs during serialized eviction of pods
Browse files Browse the repository at this point in the history
- Draining of pods with PV (Persistent Volume) now waits for re-attachment of PV on a different node.
- When volumeAttachments support is enabled on the cluster, it tracks volume attachments to determine this.
- Otherwise, it falls back to the configured PV reattachment timeout (default: 3 minutes).

Co-authored-by: Amshuman K R <[email protected]>
  • Loading branch information
prashanth26 and Amshuman K R committed Jun 8, 2021
1 parent f81374f commit ca09a31
Show file tree
Hide file tree
Showing 16 changed files with 725 additions and 120 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ managevm
controller_manager
mcm.out
kubectl
.cache_ggshield

# Binary files of MCM
./machine-controller-manager
Expand Down
3 changes: 0 additions & 3 deletions pkg/util/backoff/wait_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
"k8s.io/klog"
)

var (
Expand Down Expand Up @@ -132,8 +131,6 @@ var _ = Describe("#wait", func() {
action: action{
operation: func() error {
invokationCount += 1
klog.Error(invokationCount)

if invokationCount > 4 {
return nil
}
Expand Down
74 changes: 74 additions & 0 deletions pkg/util/k8sutils/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
Copyright (c) 2021 SAP SE or an SAP affiliate company. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package k8sutils provides helper constants and functions for k8s operations
package k8sutils

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
)

const (
// VolumeAttachmentGroupName is the API group name ("storage.k8s.io")
// intended for looking up VolumeAttachment support.
VolumeAttachmentGroupName = "storage.k8s.io"
// VolumeAttachmentResourceName is the plural resource name (not the Kind)
// for VolumeAttachment, i.e. "volumeattachments".
VolumeAttachmentResourceName = "volumeattachments"
)

// IsResourceSupported uses the Discovery API to report whether the server
// serves the given GroupResource under the preferred version of its group.
//
// It returns true only when gr.Group is present in the server's group list
// and gr.Resource is listed under that group's preferred GroupVersion.
// It returns false when the group or the resource is not found, and also when
// either discovery call fails. Discovery failures are logged so that they can
// be told apart from a resource that is genuinely unsupported.
func IsResourceSupported(
	clientset kubernetes.Interface,
	gr schema.GroupResource,
) bool {
	discoveryClient := clientset.Discovery()

	groupList, err := discoveryClient.ServerGroups()
	if err != nil {
		klog.Warningf("Failed to list server groups while checking support for %s/%s: %v", gr.Group, gr.Resource, err)
		return false
	}

	var (
		foundDesiredGroup   bool
		desiredGroupVersion string
	)
	for _, group := range groupList.Groups {
		if group.Name == gr.Group {
			foundDesiredGroup = true
			// Only the preferred version of the group is inspected below.
			desiredGroupVersion = group.PreferredVersion.GroupVersion
			break
		}
	}
	if !foundDesiredGroup {
		return false
	}

	resourceList, err := discoveryClient.ServerResourcesForGroupVersion(desiredGroupVersion)
	if err != nil {
		klog.Warningf("Failed to list server resources for group version %q while checking support for %s/%s: %v", desiredGroupVersion, gr.Group, gr.Resource, err)
		return false
	}

	for _, resource := range resourceList.APIResources {
		if resource.Name == gr.Resource {
			klog.V(4).Infof("Found Resource: %s/%s", gr.Group, gr.Resource)
			return true
		}
	}
	return false
}
1 change: 1 addition & 0 deletions pkg/util/provider/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ func StartControllers(s *options.MCServer,
controlCoreInformerFactory.Core().V1().Secrets(),
targetCoreInformerFactory.Core().V1().Nodes(),
targetCoreInformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
targetCoreInformerFactory.Storage().V1().VolumeAttachments(),
machineSharedInformers.MachineClasses(),
machineSharedInformers.Machines(),
recorder,
Expand Down
2 changes: 2 additions & 0 deletions pkg/util/provider/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func NewMCServer() *MCServer {
MachineDrainTimeout: metav1.Duration{Duration: drain.DefaultMachineDrainTimeout},
MaxEvictRetries: drain.DefaultMaxEvictRetries,
PvDetachTimeout: metav1.Duration{Duration: 2 * time.Minute},
PvReattachTimeout: metav1.Duration{Duration: 3 * time.Minute},
MachineSafetyOrphanVMsPeriod: metav1.Duration{Duration: 30 * time.Minute},
MachineSafetyAPIServerStatusCheckPeriod: metav1.Duration{Duration: 1 * time.Minute},
MachineSafetyAPIServerStatusCheckTimeout: metav1.Duration{Duration: 30 * time.Second},
Expand Down Expand Up @@ -100,6 +101,7 @@ func (s *MCServer) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&s.SafetyOptions.MachineDrainTimeout.Duration, "machine-drain-timeout", drain.DefaultMachineDrainTimeout, "Timeout (in durartion) used while draining of machine before deletion, beyond which MCM forcefully deletes machine.")
fs.Int32Var(&s.SafetyOptions.MaxEvictRetries, "machine-max-evict-retries", drain.DefaultMaxEvictRetries, "Maximum number of times evicts would be attempted on a pod before it is forcibly deleted during draining of a machine.")
fs.DurationVar(&s.SafetyOptions.PvDetachTimeout.Duration, "machine-pv-detach-timeout", s.SafetyOptions.PvDetachTimeout.Duration, "Timeout (in duration) used while waiting for detach of PV while evicting/deleting pods")
fs.DurationVar(&s.SafetyOptions.PvReattachTimeout.Duration, "machine-pv-reattach-timeout", s.SafetyOptions.PvReattachTimeout.Duration, "Timeout (in duration) used while waiting for reattach of PV onto a different node")
fs.DurationVar(&s.SafetyOptions.MachineSafetyAPIServerStatusCheckTimeout.Duration, "machine-safety-apiserver-statuscheck-timeout", s.SafetyOptions.MachineSafetyAPIServerStatusCheckTimeout.Duration, "Timeout (in duration) for which the APIServer can be down before declare the machine controller frozen by safety controller")

fs.DurationVar(&s.SafetyOptions.MachineSafetyOrphanVMsPeriod.Duration, "machine-safety-orphan-vms-period", s.SafetyOptions.MachineSafetyOrphanVMsPeriod.Duration, "Time period (in durartion) used to poll for orphan VMs by safety controller.")
Expand Down
Loading

0 comments on commit ca09a31

Please sign in to comment.