Skip to content

Commit

Permalink
Merge pull request #1906 from alexkats/neg-hc-delay
Browse files Browse the repository at this point in the history
Add delay for retrying NEG polling for unhealthy pods
  • Loading branch information
k8s-ci-robot authored Feb 6, 2023
2 parents 5b63329 + 2c6d5fe commit 710e89c
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 28 deletions.
7 changes: 6 additions & 1 deletion pkg/neg/readiness/poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ const (
// GCE NEG API RPS quota is rate limited per every 100 seconds.
// Make this retry delay to match the rate limiting interval.
// More detail: https://cloud.google.com/compute/docs/api-rate-limits
retryDelay = 100 * time.Second
retryDelay = 100 * time.Second
hcRetryDelay = time.Second
)

// negMeta references a GCE NEG resource
Expand Down Expand Up @@ -255,6 +256,10 @@ func (p *poller) processHealthStatus(key negMeta, healthStatuses []*composite.Ne
}
}

if retry {
<-p.clock.After(hcRetryDelay)
}

// If we didn't patch all of the endpoints, we must keep polling for health status
return retry, utilerrors.NewAggregate(errList)
}
Expand Down
59 changes: 32 additions & 27 deletions pkg/neg/readiness/poller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@ package readiness
import (
"context"
"fmt"
"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud"
"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter"
"google.golang.org/api/compute/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/legacy-cloud-providers/gce"
"net"
"reflect"
"strconv"
"testing"
"time"

"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud"
"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter"
"google.golang.org/api/compute/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/legacy-cloud-providers/gce"

"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta"
"k8s.io/apimachinery/pkg/types"
"k8s.io/ingress-gce/pkg/composite"
Expand Down Expand Up @@ -387,11 +388,15 @@ func TestPoll(t *testing.T) {
},
}

pollAndValidate := func(desc string, expectErr bool, expectRetry bool, expectPatchCount int, stepClock bool) {
pollAndValidate := func(desc string, expectErr bool, expectRetry bool, expectPatchCount int, stepClock bool, healthStatusDelay bool) {
if stepClock {
go func() {
time.Sleep(2 * time.Second)
fakeClock.Step(retryDelay)
delay := retryDelay
if healthStatusDelay {
delay = hcRetryDelay
}
fakeClock.Step(delay)
}()
}
retry, err := poller.Poll(key)
Expand All @@ -416,19 +421,19 @@ func TestPoll(t *testing.T) {
polling: true,
}

pollAndValidate(step, false, true, 0, false)
pollAndValidate(step, false, true, 0, false)
pollAndValidate(step, false, true, 0, false, false)
pollAndValidate(step, false, true, 0, false, false)

step = "unmark polling"
poller.pollMap[key].polling = false
pollAndValidate(step, true, true, 0, true)
pollAndValidate(step, true, true, 0, true)
pollAndValidate(step, true, true, 0, true, false)
pollAndValidate(step, true, true, 0, true, false)

step = "NEG exists, but with no endpoint"
// create NEG, but with no endpoint
negCloud.CreateNetworkEndpointGroup(&composite.NetworkEndpointGroup{Name: negName, Zone: zone, Version: meta.VersionGA}, zone)
pollAndValidate(step, false, true, 0, false)
pollAndValidate(step, false, true, 0, false)
pollAndValidate(step, false, true, 0, true, true)
pollAndValidate(step, false, true, 0, true, true)

step = "NE added to the NEG, but NE health status is empty"
ne := &composite.NetworkEndpoint{
Expand All @@ -446,8 +451,8 @@ func TestPoll(t *testing.T) {
},
})

pollAndValidate(step, false, false, 1, false)
pollAndValidate(step, false, false, 2, false)
pollAndValidate(step, false, false, 1, false, false)
pollAndValidate(step, false, false, 2, false, false)
patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), nil)

step = "NE health status is empty and there are other endpoint with health status in NEG"
Expand All @@ -458,8 +463,8 @@ func TestPoll(t *testing.T) {
Healths: []*composite.HealthStatusForNetworkEndpoint{},
},
})
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, true, true)
pollAndValidate(step, false, true, 2, true, true)

step = "NE has nonhealthy status"
negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{
Expand All @@ -475,8 +480,8 @@ func TestPoll(t *testing.T) {
},
},
})
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, true, true)
pollAndValidate(step, false, true, 2, true, true)

step = "NE has nonhealthy status with irrelevant entry"
negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{
Expand All @@ -493,8 +498,8 @@ func TestPoll(t *testing.T) {
},
},
})
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, false)
pollAndValidate(step, false, true, 2, true, true)
pollAndValidate(step, false, true, 2, true, true)

step = "NE has unsupported health"
negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{
Expand All @@ -510,8 +515,8 @@ func TestPoll(t *testing.T) {
},
},
})
pollAndValidate(step, false, false, 3, false)
pollAndValidate(step, false, false, 4, false)
pollAndValidate(step, false, false, 3, false, false)
pollAndValidate(step, false, false, 4, false, false)

step = "NE has healthy status"
bsName := "bar"
Expand All @@ -530,9 +535,9 @@ func TestPoll(t *testing.T) {
},
irrelevantEntry,
})
pollAndValidate(step, false, false, 5, false)
pollAndValidate(step, false, false, 5, false, false)
patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), meta.GlobalKey(bsName))
pollAndValidate(step, false, false, 6, false)
pollAndValidate(step, false, false, 6, false, false)
patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), meta.GlobalKey(bsName))

step = "ListNetworkEndpoint return error response"
Expand All @@ -542,8 +547,8 @@ func TestPoll(t *testing.T) {
return nil, fmt.Errorf("random error from GCE")
}
poller.negCloud = negtypes.NewAdapter(fakeGCE)
pollAndValidate(step, true, true, 6, true)
pollAndValidate(step, true, true, 6, true)
pollAndValidate(step, true, true, 6, true, false)
pollAndValidate(step, true, true, 6, true, false)
}

func TestProcessHealthStatus(t *testing.T) {
Expand Down

0 comments on commit 710e89c

Please sign in to comment.