Skip to content

Commit

Permalink
CNS - Ensuring no stale NCs during NNC reconcile (#2192)
Browse files Browse the repository at this point in the history
* ensuring no stale ncs during nnc reconcile

* only save state if mutated

* ensuring we only remove stale ncs if none of their ips are assigned
  • Loading branch information
ramiro-gamarra authored Sep 8, 2023
1 parent 1360e02 commit 666f36c
Show file tree
Hide file tree
Showing 4 changed files with 282 additions and 5 deletions.
10 changes: 10 additions & 0 deletions cns/kubecontroller/nodenetworkconfig/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

type cnsClient interface {
CreateOrUpdateNetworkContainerInternal(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode
MustEnsureNoStaleNCs(validNCIDs []string)
}

type nodeNetworkConfigListener interface {
Expand Down Expand Up @@ -74,6 +75,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco

ipAssignments := 0

// during node upgrades, an nnc may be updated with new ncs. at any given time, only the ncs
// that exist in the nnc are valid. any others that may have been previously created and no
// longer exist in the nnc should be considered stale.
validNCIDs := make([]string, len(nnc.Status.NetworkContainers))
for i := range nnc.Status.NetworkContainers {
validNCIDs[i] = nnc.Status.NetworkContainers[i].ID
}
r.cnscli.MustEnsureNoStaleNCs(validNCIDs)

// for each NC, parse it in to a CreateNCRequest and forward it to the appropriate Listener
for i := range nnc.Status.NetworkContainers {
// check if this NC matches the Node IP if we have one to check against
Expand Down
89 changes: 84 additions & 5 deletions cns/kubecontroller/nodenetworkconfig/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ import (
)

type cnsClientState struct {
req *cns.CreateNetworkContainerRequest
nnc *v1alpha.NodeNetworkConfig
reqsByNCID map[string]*cns.CreateNetworkContainerRequest
nnc *v1alpha.NodeNetworkConfig
}

type mockCNSClient struct {
Expand All @@ -29,10 +29,23 @@ type mockCNSClient struct {
}

func (m *mockCNSClient) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) cnstypes.ResponseCode {
m.state.req = req
m.state.reqsByNCID[req.NetworkContainerid] = req
return m.createOrUpdateNC(req)
}

func (m *mockCNSClient) MustEnsureNoStaleNCs(validNCIDs []string) {
valid := make(map[string]struct{})
for _, ncID := range validNCIDs {
valid[ncID] = struct{}{}
}

for ncID := range m.state.reqsByNCID {
if _, ok := valid[ncID]; !ok {
delete(m.state.reqsByNCID, ncID)
}
}
}

func (m *mockCNSClient) Update(nnc *v1alpha.NodeNetworkConfig) error {
m.state.nnc = nnc
return m.update(nnc)
Expand Down Expand Up @@ -112,7 +125,7 @@ func TestReconcile(t *testing.T) {
},
wantErr: true,
wantCNSClientState: cnsClientState{
req: validSwiftRequest,
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
},
},
{
Expand All @@ -137,7 +150,7 @@ func TestReconcile(t *testing.T) {
},
wantErr: false,
wantCNSClientState: cnsClientState{
req: validSwiftRequest,
reqsByNCID: map[string]*cns.CreateNetworkContainerRequest{validSwiftRequest.NetworkContainerid: validSwiftRequest},
nnc: &v1alpha.NodeNetworkConfig{
Status: validSwiftStatus,
Spec: v1alpha.NodeNetworkConfigSpec{
Expand Down Expand Up @@ -173,6 +186,11 @@ func TestReconcile(t *testing.T) {
}
for _, tt := range tests {
tt := tt
tt.cnsClient.state.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
if tt.wantCNSClientState.reqsByNCID == nil {
tt.wantCNSClientState.reqsByNCID = make(map[string]*cns.CreateNetworkContainerRequest)
}

t.Run(tt.name, func(t *testing.T) {
r := NewReconciler(&tt.cnsClient, &tt.cnsClient, tt.nodeIP)
r.nnccli = &tt.ncGetter
Expand All @@ -187,3 +205,64 @@ func TestReconcile(t *testing.T) {
})
}
}

func TestReconcileStaleNCs(t *testing.T) {
logger.InitLogger("", 0, 0, "")

cnsClient := mockCNSClient{
state: cnsClientState{reqsByNCID: make(map[string]*cns.CreateNetworkContainerRequest)},
createOrUpdateNC: func(*cns.CreateNetworkContainerRequest) cnstypes.ResponseCode { return cnstypes.Success },
update: func(*v1alpha.NodeNetworkConfig) error { return nil },
}

nodeIP := "10.0.0.10"

nncv1 := v1alpha.NodeNetworkConfig{
Status: v1alpha.NodeNetworkConfigStatus{
NetworkContainers: []v1alpha.NetworkContainer{
{ID: "nc1", PrimaryIP: "10.1.0.10", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
{ID: "nc2", PrimaryIP: "10.1.0.11", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
},
},
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
}

nncv2 := v1alpha.NodeNetworkConfig{
Status: v1alpha.NodeNetworkConfigStatus{
NetworkContainers: []v1alpha.NetworkContainer{
{ID: "nc3", PrimaryIP: "10.1.0.12", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
{ID: "nc4", PrimaryIP: "10.1.0.13", SubnetAddressSpace: "10.1.0.0/24", NodeIP: nodeIP},
},
},
Spec: v1alpha.NodeNetworkConfigSpec{RequestedIPCount: 10},
}

i := 0
nncIterator := func(context.Context, types.NamespacedName) (*v1alpha.NodeNetworkConfig, error) {
nncLog := []v1alpha.NodeNetworkConfig{nncv1, nncv2}
for i < len(nncLog) {
j := i
i++
return &nncLog[j], nil
}

return &nncLog[len(nncLog)-1], nil
}

r := NewReconciler(&cnsClient, &cnsClient, nodeIP)
r.nnccli = &mockNCGetter{get: nncIterator}

_, err := r.Reconcile(context.Background(), reconcile.Request{})
require.NoError(t, err)

assert.Contains(t, cnsClient.state.reqsByNCID, "nc1")
assert.Contains(t, cnsClient.state.reqsByNCID, "nc2")

_, err = r.Reconcile(context.Background(), reconcile.Request{})
require.NoError(t, err)

assert.NotContains(t, cnsClient.state.reqsByNCID, "nc1")
assert.NotContains(t, cnsClient.state.reqsByNCID, "nc2")
assert.Contains(t, cnsClient.state.reqsByNCID, "nc3")
assert.Contains(t, cnsClient.state.reqsByNCID, "nc4")
}
37 changes: 37 additions & 0 deletions cns/restserver/internalapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,43 @@ func (service *HTTPRestService) DeleteNetworkContainerInternal(
return types.Success
}

func (service *HTTPRestService) MustEnsureNoStaleNCs(validNCIDs []string) {
valid := make(map[string]struct{})
for _, ncID := range validNCIDs {
valid[ncID] = struct{}{}
}

service.Lock()
defer service.Unlock()

ncIDToAssignedIPs := make(map[string][]cns.IPConfigurationStatus)
for _, ipInfo := range service.PodIPConfigState { // nolint:gocritic // copy is fine; it's a larger change to modify the map to hold pointers
if ipInfo.GetState() == types.Assigned {
ncIDToAssignedIPs[ipInfo.NCID] = append(ncIDToAssignedIPs[ipInfo.NCID], ipInfo)
}
}

mutated := false
for ncID := range service.state.ContainerStatus {
if _, ok := valid[ncID]; !ok {
// stale NCs with assigned IPs are an unexpected CNS state which we need to alert on.
if assignedIPs, hasAssignedIPs := ncIDToAssignedIPs[ncID]; hasAssignedIPs {
msg := fmt.Sprintf("Unexpected state: found stale NC ID %s in CNS state with %d assigned IPs: %+v", ncID, len(assignedIPs), assignedIPs)
logger.Errorf(msg)
panic(msg)
}

logger.Errorf("[Azure CNS] Found stale NC ID %s in CNS state. Removing...", ncID)
delete(service.state.ContainerStatus, ncID)
mutated = true
}
}

if mutated {
_ = service.saveState()
}
}

// This API will be called by CNS RequestController on CRD update.
func (service *HTTPRestService) CreateOrUpdateNetworkContainerInternal(req *cns.CreateNetworkContainerRequest) types.ResponseCode {
if req.NetworkContainerid == "" {
Expand Down
151 changes: 151 additions & 0 deletions cns/restserver/internalapi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/Azure/azure-container-networking/cns/types"
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
nma "github.com/Azure/azure-container-networking/nmagent"
"github.com/Azure/azure-container-networking/store"
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -1275,3 +1276,153 @@ func TestCNIConflistGenerationOnNMAError(t *testing.T) {
time.Sleep(time.Second)
assert.Equal(t, 1, mockgen.getGeneratedCount())
}

func TestMustEnsureNoStaleNCs(t *testing.T) {
tests := []struct {
name string
storedNCs map[string]containerstatus
ipStates map[string]cns.IPConfigurationStatus
ipStatesOverrideFunc func(map[string]cns.IPConfigurationStatus) // defaults to available
ncsFromReconcile []string
expectedRemovedNCs []string
}{
{
name: "normal reconcile, single nc",
storedNCs: map[string]containerstatus{
"nc1": {},
},
ipStates: map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
},
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
for k, is := range ipStates {
is.SetState(types.Assigned)
ipStates[k] = is
}
},
ncsFromReconcile: []string{"nc1"},
expectedRemovedNCs: []string{},
},
{
name: "normal reconcile, dual stack ncs",
storedNCs: map[string]containerstatus{
"nc1": {},
"nc2": {},
},
ipStates: map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
},
ipStatesOverrideFunc: func(ipStates map[string]cns.IPConfigurationStatus) {
for k, is := range ipStates {
is.SetState(types.Assigned)
ipStates[k] = is
}
},
ncsFromReconcile: []string{"nc1", "nc2"},
expectedRemovedNCs: []string{},
},
{
name: "stale nc",
storedNCs: map[string]containerstatus{
"nc1": {},
},
ipStates: map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
},
ncsFromReconcile: []string{"nc2"},
expectedRemovedNCs: []string{"nc1"},
},
{
name: "stale dual stack ncs",
storedNCs: map[string]containerstatus{
"nc1": {},
"nc2": {},
},
ipStates: map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
},
ncsFromReconcile: []string{"nc3", "nc4"},
expectedRemovedNCs: []string{"nc1", "nc2"},
},
{
name: "stale ncs, ips in flight",
storedNCs: map[string]containerstatus{
"nc1": {},
"nc2": {},
},
ipStates: map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: "nc1"},
"aabb": {IPAddress: "fe80::1ff:fe23:4567:890a", NCID: "nc2"},
},
ipStatesOverrideFunc: func(m map[string]cns.IPConfigurationStatus) {
ip1 := m["aaaa"]
ip1.SetState(types.PendingRelease)
m["aaaa"] = ip1

ip2 := m["aabb"]
ip2.SetState(types.PendingProgramming)
m["aabb"] = ip2
},
ncsFromReconcile: []string{"nc3", "nc4"},
expectedRemovedNCs: []string{"nc1", "nc2"},
},
}

for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
if tt.ipStatesOverrideFunc != nil {
tt.ipStatesOverrideFunc(tt.ipStates)
} else {
for k, is := range tt.ipStates {
is.SetState(types.Available)
tt.ipStates[k] = is
}
}

svc := HTTPRestService{
store: store.NewMockStore("foo"),
state: &httpRestServiceState{ContainerStatus: tt.storedNCs},
PodIPConfigState: tt.ipStates,
}

require.NotPanics(t, func() {
svc.MustEnsureNoStaleNCs(tt.ncsFromReconcile)
})

for _, expectedRemovedNCID := range tt.expectedRemovedNCs {
assert.NotContains(t, svc.state.ContainerStatus, expectedRemovedNCID)
}
})
}
}

func TestMustEnsureNoStaleNCs_PanicsWhenIPsFromStaleNCAreAssigned(t *testing.T) {
staleNC1 := "nc1"
staleNC2 := "nc2"

ncs := map[string]containerstatus{staleNC1: {}, staleNC2: {}}

ipStates := map[string]cns.IPConfigurationStatus{
"aaaa": {IPAddress: "10.0.0.10", NCID: staleNC1},
"aabb": {IPAddress: "10.0.0.11", NCID: staleNC1},
"aacc": {IPAddress: "10.0.0.12", NCID: staleNC2},
"aadd": {IPAddress: "10.0.0.13", NCID: staleNC2},
}
for k, is := range ipStates {
is.SetState(types.Assigned)
ipStates[k] = is
}

svc := HTTPRestService{
store: store.NewMockStore("foo"),
state: &httpRestServiceState{ContainerStatus: ncs},
PodIPConfigState: ipStates,
}

require.Panics(t, func() {
svc.MustEnsureNoStaleNCs([]string{"nc3", "nc4"})
})
}

0 comments on commit 666f36c

Please sign in to comment.