Skip to content

Commit

Permalink
bugfix: avoid the deadlock when failed to remove invalid sandbox
Browse files Browse the repository at this point in the history
Signed-off-by: YaoZengzeng <[email protected]>
  • Loading branch information
YaoZengzeng committed Aug 14, 2018
1 parent 218807c commit 6b064b8
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 2 deletions.
10 changes: 9 additions & 1 deletion cri/v1alpha1/cri.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,10 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
defer func() {
// If running sandbox failed, clean up the container.
if retErr != nil {
c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
err := c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
if err != nil {
logrus.Errorf("failed to remove the container when running sandbox failed: %v", err)
}
}
}()

Expand Down Expand Up @@ -462,6 +465,11 @@ func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb
return nil, fmt.Errorf("failed to list sandbox: %v", err)
}

sandboxList, err = c.filterInvalidSandboxes(ctx, sandboxList)
if err != nil {
return nil, fmt.Errorf("failed to filter invalid sandboxes: %v", err)
}

sandboxes := make([]*runtime.PodSandbox, 0, len(sandboxList))
for _, s := range sandboxList {
sandbox, err := toCriSandbox(s)
Expand Down
39 changes: 39 additions & 0 deletions cri/v1alpha1/cri_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/containerd/typeurl"
"github.com/cri-o/ocicni/pkg/ocicni"
"github.com/go-openapi/strfmt"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"
"golang.org/x/sys/unix"
"k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
Expand Down Expand Up @@ -313,6 +314,44 @@ func toCriSandbox(c *mgr.Container) (*runtime.PodSandbox, error) {
}, nil
}

// It has the possibility that we failed to run the sandbox and it is not being cleaned up.
// Kubelet will use list to get the sandboxes, but will not get the status of the failed pod
// whose meta data has not been put into the Sandbox Store. And Kubelet will keep trying to
// get the status of the failed pod and won't create a new one to replace it. It's a DEAD LOCK.
// Actually Kubelet should not know the existence of invalid pod whose meta data won't be in the
// Sandbox Store. So we could avoid the DEAD LOCK mentioned above.
func (c *CriManager) filterInvalidSandboxes(ctx context.Context, sandboxes []*mgr.Container) ([]*mgr.Container, error) {
validSandboxes, err := c.SandboxStore.Keys()
if err != nil {
return nil, err
}

var result []*mgr.Container
for _, sandbox := range sandboxes {
exist := false
for _, id := range validSandboxes {
if sandbox.ID == id {
exist = true
break
}
}
if exist {
result = append(result, sandbox)
continue
}

status := sandbox.State.Status
// NOTE: what if the worst case that we failed to remove the sandbox and
// it is still running?
if status != apitypes.StatusRunning && status != apitypes.StatusCreated {
// Remove invalid sandbox.
logrus.Warnf("filterInvalidSandboxes: remove invalid sandbox %v", sandbox.ID)
c.ContainerMgr.Remove(ctx, sandbox.ID, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
}
}
return result, nil
}

func filterCRISandboxes(sandboxes []*runtime.PodSandbox, filter *runtime.PodSandboxFilter) []*runtime.PodSandbox {
if filter == nil {
return sandboxes
Expand Down
10 changes: 9 additions & 1 deletion cri/v1alpha2/cri.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,10 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
defer func() {
// If running sandbox failed, clean up the container.
if retErr != nil {
c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
err := c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
if err != nil {
logrus.Errorf("failed to remove the container when running sandbox failed: %v", err)
}
}
}()

Expand Down Expand Up @@ -468,6 +471,11 @@ func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb
return nil, fmt.Errorf("failed to list sandbox: %v", err)
}

sandboxList, err = c.filterInvalidSandboxes(ctx, sandboxList)
if err != nil {
return nil, fmt.Errorf("failed to filter invalid sandboxes: %v", err)
}

sandboxes := make([]*runtime.PodSandbox, 0, len(sandboxList))
for _, s := range sandboxList {
sandbox, err := toCriSandbox(s)
Expand Down
38 changes: 38 additions & 0 deletions cri/v1alpha2/cri_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/containerd/typeurl"
"github.com/cri-o/ocicni/pkg/ocicni"
"github.com/go-openapi/strfmt"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"
)

Expand Down Expand Up @@ -308,6 +309,43 @@ func toCriSandbox(c *mgr.Container) (*runtime.PodSandbox, error) {
}, nil
}

// It has the possibility that we failed to run the sandbox and it is not being cleaned up.
// Kubelet will use list to get the sandboxes, but will not get the status of the failed pod
// whose meta data has not been put into the Sandbox Store. And Kubelet will keep trying to
// get the status of the failed pod and won't create a new one to replace it. It's a DEAD LOCK.
// Actually Kubelet should not know the existence of invalid pod whose meta data won't be in the
// Sandbox Store. So we could avoid the DEAD LOCK mentioned above.
func (c *CriManager) filterInvalidSandboxes(ctx context.Context, sandboxes []*mgr.Container) ([]*mgr.Container, error) {
validSandboxes, err := c.SandboxStore.Keys()
if err != nil {
return nil, err
}

var result []*mgr.Container
for _, sandbox := range sandboxes {
exist := false
for _, id := range validSandboxes {
if sandbox.ID == id {
exist = true
break
}
}
if exist {
result = append(result, sandbox)
continue
}

status := sandbox.State.Status
// NOTE: what if the worst case that we failed to remove the sandbox and
// it is still running?
if status != apitypes.StatusRunning && status != apitypes.StatusCreated {
logrus.Warnf("filterInvalidSandboxes: remove invalid sandbox %v", sandbox.ID)
c.ContainerMgr.Remove(ctx, sandbox.ID, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
}
}
return result, nil
}

func filterCRISandboxes(sandboxes []*runtime.PodSandbox, filter *runtime.PodSandboxFilter) []*runtime.PodSandbox {
if filter == nil {
return sandboxes
Expand Down

0 comments on commit 6b064b8

Please sign in to comment.