diff --git a/docs/networking.md b/docs/networking.md index 5cee69cc3e2b8..275c50300e5af 100644 --- a/docs/networking.md +++ b/docs/networking.md @@ -190,6 +190,15 @@ For help with Calico or to report any issues: Calico currently uses etcd as a backend for storing information about workloads and policies. Calico does not interfere with normal etcd operations and does not require special handling when upgrading etcd. For more information please visit the [etcd Docs](https://coreos.com/etcd/docs/latest/) +#### Calico troubleshooting +##### New nodes take minutes to sync IP routes and new pods on them can't reach kubedns +This is caused by stale entries in the Calico etcd nodestore for nodes that no longer exist. Due to the ephemeral nature of AWS EC2 instances, new nodes are brought up with different hostnames, and nodes that are taken offline remain in the Calico nodestore. This is unlike most datacentre deployments, where the hostnames in a cluster are mostly static. Read more about this issue at https://github.com/kubernetes/kops/issues/3224 +This has been solved in kops 1.8.2. When creating a new cluster no action is needed, but if the cluster was created with a prior kops version, the following actions should be taken: + * Use kops to update the cluster: ```kops update cluster --yes``` + * Delete all calico-node pods in the kube-system namespace, so that they will apply the new env CALICO_K8S_NODE_REF and update the current nodes in etcd + * Decommission all invalid nodes, [see here](https://docs.projectcalico.org/v2.6/usage/decommissioning-a-node) + * All nodes that are deleted from the cluster after these actions should be cleaned from Calico's etcd storage, and the delay in programming routes should be resolved. + ### Canal Example for CNI and Network Policy Canal is a project that combines [Flannel](https://github.com/coreos/flannel) and [Calico](http://docs.projectcalico.org/latest/getting-started/kubernetes/installation/hosted/) for CNI Networking. 
It uses Flannel for networking pod traffic between hosts via VXLAN and Calico for network policy enforcement and pod to pod traffic. diff --git a/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.6.yaml.template b/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.6.yaml.template index a200143327526..c2d718a4144c9 100644 --- a/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.6.yaml.template +++ b/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.6.yaml.template @@ -158,6 +158,11 @@ spec: # Disable file logging so `kubectl logs` works. - name: CALICO_DISABLE_FILE_LOGGING value: "true" + # Set noderef for node controller. + - name: CALICO_K8S_NODE_REF + valueFrom: + fieldRef: + fieldPath: spec.nodeName # Auto-detect the BGP IP address. - name: IP value: "" @@ -322,6 +327,13 @@ spec: requests: cpu: 10m env: + # By default only policy, profile, workloadendpoint are turned + # on. The node controller will decommission nodes that do not exist anymore; + # this and CALICO_K8S_NODE_REF in calico-node fix #3224, but invalid nodes that are + # already registered in Calico need to be deleted manually, see + # https://docs.projectcalico.org/v2.6/usage/decommissioning-a-node + - name: ENABLED_CONTROLLERS + value: policy,profile,workloadendpoint,node # The location of the Calico etcd cluster. - name: ETCD_ENDPOINTS valueFrom: diff --git a/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.7.yaml.template b/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.7.yaml.template index b36e517b70f6e..893d445cdcd36 100644 --- a/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.7.yaml.template +++ b/upup/models/cloudup/resources/addons/networking.projectcalico.org/k8s-1.7.yaml.template @@ -169,6 +169,11 @@ spec: # Disable file logging so `kubectl logs` works. 
- name: CALICO_DISABLE_FILE_LOGGING value: "true" + # Set noderef for node controller. + - name: CALICO_K8S_NODE_REF + valueFrom: + fieldRef: + fieldPath: spec.nodeName # Auto-detect the BGP IP address. - name: IP value: "" @@ -287,6 +292,13 @@ spec: requests: cpu: 10m env: + # By default only policy, profile, workloadendpoint are turned + # on. The node controller will decommission nodes that do not exist anymore; + # this and CALICO_K8S_NODE_REF in calico-node fix #3224, but invalid nodes that are + # already registered in Calico need to be deleted manually, see + # https://docs.projectcalico.org/v2.6/usage/decommissioning-a-node + - name: ENABLED_CONTROLLERS + value: policy,profile,workloadendpoint,node # The location of the Calico etcd cluster. - name: ETCD_ENDPOINTS valueFrom: diff --git a/upup/pkg/fi/cloudup/bootstrapchannelbuilder.go b/upup/pkg/fi/cloudup/bootstrapchannelbuilder.go index 854345eb06de2..4bd0d76b097ab 100644 --- a/upup/pkg/fi/cloudup/bootstrapchannelbuilder.go +++ b/upup/pkg/fi/cloudup/bootstrapchannelbuilder.go @@ -476,8 +476,8 @@ func (b *BootstrapChannelBuilder) buildManifest() (*channelsapi.Addons, map[stri key := "networking.projectcalico.org" versions := map[string]string{ "pre-k8s-1.6": "2.4.1", - "k8s-1.6": "2.6.2", - "k8s-1.7": "2.6.2", + "k8s-1.6": "2.6.6-kops.1", + "k8s-1.7": "2.6.6-kops.1", } {