Skip to content
This repository has been archived by the owner on Jun 29, 2022. It is now read-only.

Implement automatic certificate rotation #1201

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ spec:
annotations:
checkpointer.alpha.coreos.com/checkpoint: "true"
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
# Automatically triggers a rolling update when the secret changes.
checksum/secret: {{ include (print $.Template.BasePath "/kube-apiserver-secret.yaml") . | sha256sum }}
spec:
{{- template "containers" . }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ spec:
annotations:
checkpointer.alpha.coreos.com/checkpoint: "true"
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
# Automatically triggers a rolling update when the secret changes.
checksum/secret: {{ include (print $.Template.BasePath "/kube-apiserver-secret.yaml") . | sha256sum }}
spec:
{{- template "containers" . }}
{{- end }}
18 changes: 17 additions & 1 deletion assets/charts/control-plane/kubelet/templates/kubelet-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,23 @@ spec:
tier: node
k8s-app: kubelet
spec:
automountServiceAccountToken: false
initContainers:
# ca-syncer rewrites the certificate-authority-data entry of the kubelet
# kubeconfig files on the host with the CA certificate taken from the chart
# values, before the kubelet container starts.
- name: ca-syncer
  image: {{ .Values.image }}
  command:
  - bash
  - -c
  # The sed expression uses "|" as its delimiter: the certificate is
  # base64-encoded and the base64 alphabet contains "/", which would terminate
  # a "/"-delimited expression early. "|" never occurs in base64 output.
  - |
    sed -i "s|^ certificate-authority-data:.*$| certificate-authority-data: {{ .Values.kubernetesCACert }}|g" /var/lib/kubelet/kubeconfig /etc/kubernetes/kubeconfig
  securityContext:
    privileged: true
  volumeMounts:
  - name: var-lib-kubelet
    mountPath: /var/lib/kubelet
    readOnly: false
  - name: etc-kubernetes
    mountPath: /etc/kubernetes
    readOnly: false
containers:
- name: kubelet
image: {{ .Values.image }}
Expand Down
1 change: 1 addition & 0 deletions assets/charts/control-plane/kubelet/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ clusterDNS: 10.0.0.10
clusterDomain: cluster.local
enableTLSBootstrap: true
cloudProvider:
kubernetesCACert: ""
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
- --pod-eviction-timeout=1m
- --root-ca-file=/etc/kubernetes/secrets/ca.crt
- --service-account-private-key-file=/etc/kubernetes/secrets/service-account.key
- --cluster-signing-duration=45m
livenessProbe:
httpGet:
scheme: HTTPS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ spec:
k8s-app: kube-controller-manager
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
# Automatically triggers a rolling update when the secret changes.
checksum/secret: {{ include (print $.Template.BasePath "/kube-controller-manager-secret.yaml") . | sha256sum }}
spec:
{{- template "controller-manager-containers" . }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ spec:
k8s-app: kube-controller-manager
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
# Automatically triggers a rolling update when the secret changes.
checksum/secret: {{ include (print $.Template.BasePath "/kube-controller-manager-secret.yaml") . | sha256sum }}
spec:
{{- template "controller-manager-containers" . }}
{{- end }}
6 changes: 5 additions & 1 deletion assets/terraform-modules/aws/flatcar-linux/kubernetes/ssh.tf
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,15 @@ resource "null_resource" "copy-controller-secrets" {
"sudo mv etcd-peer.key /etc/ssl/etcd/etcd/peer.key",
"sudo chown -R etcd:etcd /etc/ssl/etcd",
"sudo chmod -R 500 /etc/ssl/etcd",
"sudo systemctl restart etcd",
]
}

triggers = {
controller_id = aws_instance.controllers[count.index].id
controller_id = aws_instance.controllers[count.index].id
etcd_ca_cert = module.bootkube.etcd_ca_cert
etcd_server_cert = module.bootkube.etcd_server_cert
etcd_peer_cert = module.bootkube.etcd_peer_cert
}
}

Expand Down
1 change: 1 addition & 0 deletions assets/terraform-modules/bootkube/assets.tf
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ locals {
cluster_domain_suffix = var.cluster_domain_suffix
enable_tls_bootstrap = var.enable_tls_bootstrap
cloud_provider = var.cloud_provider
kubernetes_ca_cert = base64encode(tls_self_signed_cert.kube-ca.cert_pem)
})

kubeconfig_kubelet_content = templatefile("${path.module}/resources/kubeconfig-kubelet", {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ clusterDNS: ${cluster_dns_service_ip}
clusterDomain: ${cluster_domain_suffix}
enableTLSBootstrap: ${enable_tls_bootstrap}
cloudProvider: ${cloud_provider}
kubernetesCACert: ${kubernetes_ca_cert}
34 changes: 34 additions & 0 deletions cli/cmd/cluster/apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ func Apply(contextLogger *log.Entry, options ApplyOptions) error {
}
}

// If we are not on managed platform, rotate all certificates while upgrading.
if exists && !c.platform.Meta().Managed {
if err := c.taintCertificates(); err != nil {
return fmt.Errorf("tainting certificate resources: %w", err)
}
}

if err := c.platform.Apply(&c.terraformExecutor); err != nil {
return fmt.Errorf("applying platform: %v", err)
}
Expand Down Expand Up @@ -145,6 +152,33 @@ func Apply(contextLogger *log.Entry, options ApplyOptions) error {
return fmt.Errorf("upgrading controlplane component %q: %w", c.Name, err)
}
}

cs, err := k8sutil.NewClientset(kubeconfig)
if err != nil {
return fmt.Errorf("creating clientset from kubeconfig: %w", err)
}

newCACert, err := c.readKubernetesCAFromTerraformOutput()
if err != nil {
return fmt.Errorf("reading Kubernetes CA certificate from Terraform output: %w", err)
}

crConfig := certificateRotatorConfig{
clientSet: cs,
newCACert: newCACert,
logger: contextLogger,
daemonSetsToRestart: c.platform.Meta().DaemonSets,
deploymentsToRestart: c.platform.Meta().Deployments,
}

cr, err := newCertificateRotator(crConfig)
if err != nil {
return fmt.Errorf("preparing certificate rotator: %w", err)
}

if err := cr.rotate(); err != nil {
return fmt.Errorf("rotating certificates: %w", err)
}
}

if ph, ok := c.platform.(platform.PlatformWithPostApplyHook); ok {
Expand Down
158 changes: 158 additions & 0 deletions cli/cmd/cluster/certificate-rotator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package cluster

import (
"context"
"fmt"
"time"

log "github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"

"github.com/kinvolk/lokomotive/pkg/k8sutil"
"github.com/kinvolk/lokomotive/pkg/platform"
)

// certificateRotatorConfig groups the inputs a certificateRotator works with.
type certificateRotatorConfig struct {
// clientSet is the Kubernetes client used to list secrets and to reach the
// DaemonSet and Deployment APIs.
clientSet *kubernetes.Clientset
// newCACert is the value compared against the "ca.crt" entry of every
// service account token secret.
newCACert string
// logger receives the progress messages printed while rotating.
logger *log.Entry
// daemonSetsToRestart lists the DaemonSets restarted by rotate, in order.
daemonSetsToRestart []platform.Workload
// deploymentsToRestart lists the Deployments restarted by rotate, in order,
// after all DaemonSets.
deploymentsToRestart []platform.Workload
}

// certificateRotator waits for service account tokens to carry the new CA
// certificate and then restarts the configured workloads (see rotate).
type certificateRotator struct {
// config holds the settings the rotator operates on.
config certificateRotatorConfig
}

const (
// Time to wait between updating DaemonSet/Deployment and start looking if
// workload has converged. This should account for kube-controller-manager election time,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might it be an idea to watch the leader election to ensure this, instead of guessing a time?

// reconciliation period and time spent in reconciliation loop (based on number of workloads
// in the cluster). 10 seconds might not be enough.
kubeControllerManagerReconciliationPeriod = 10 * time.Second
)

// newCertificateRotator validates config and returns a certificateRotator
// wrapping it.
//
// An error is returned when the clientset is nil, the new CA certificate is
// empty or the logger is nil.
func newCertificateRotator(config certificateRotatorConfig) (*certificateRotator, error) {
	if config.clientSet == nil {
		return nil, fmt.Errorf("clientSet can't be nil")
	}

	if config.newCACert == "" {
		return nil, fmt.Errorf("new CA certificate can't be empty")
	}

	// rotate logs through config.logger unconditionally, so reject a nil
	// logger here instead of failing in the middle of a rotation.
	if config.logger == nil {
		return nil, fmt.Errorf("logger can't be nil")
	}

	return &certificateRotator{
		config: config,
	}, nil
}

// restartDaemonSetAndWaitToConverge requests a rollout restart of DaemonSet
// name in namespace ns, pauses for kubeControllerManagerReconciliationPeriod
// and then waits for the DaemonSet via k8sutil.WaitForDaemonSet, passing the
// generation returned by the restart.
func (cr *certificateRotator) restartDaemonSetAndWaitToConverge(ns, name string) error {
	client := cr.config.clientSet.AppsV1().DaemonSets(ns)

	gen, err := k8sutil.RolloutRestartDaemonSet(client, name)
	if err != nil {
		return fmt.Errorf("restarting: %w", err)
	}

	// Pause before polling so the update has a chance to be picked up.
	// TODO: make sure this is the right value.
	time.Sleep(kubeControllerManagerReconciliationPeriod)

	err = k8sutil.WaitForDaemonSet(client, name, k8sutil.WaitOptions{Generation: gen})
	if err != nil {
		return fmt.Errorf("waiting for DaemonSet to converge: %w", err)
	}

	return nil
}

// restartDeploymentAndWaitToConverge requests a rollout restart of Deployment
// name in namespace ns, pauses for kubeControllerManagerReconciliationPeriod
// and then waits for the Deployment via k8sutil.WaitForDeployment, passing the
// generation returned by the restart.
func (cr *certificateRotator) restartDeploymentAndWaitToConverge(ns, name string) error {
	client := cr.config.clientSet.AppsV1().Deployments(ns)

	gen, err := k8sutil.RolloutRestartDeployment(client, name)
	if err != nil {
		return fmt.Errorf("restarting: %w", err)
	}

	// Pause before polling so the update has a chance to be picked up.
	// TODO: make sure this is the right value.
	time.Sleep(kubeControllerManagerReconciliationPeriod)

	err = k8sutil.WaitForDeployment(client, name, k8sutil.WaitOptions{Generation: gen})
	if err != nil {
		return fmt.Errorf("waiting for Deployment to converge: %w", err)
	}

	return nil
}

// rotate first waits until every service account token carries the new CA
// certificate, then restarts the configured DaemonSets followed by the
// configured Deployments, one at a time, waiting for each to converge.
// It stops at the first failure and returns the wrapped error.
func (cr *certificateRotator) rotate() error {
	logger := cr.config.logger

	logger.Printf("Waiting for all service account tokens on the cluster to be updated...")

	if err := cr.waitForUpdatedServiceAccountTokens(); err != nil {
		return fmt.Errorf("waiting for all service account tokens to be updated: %w", err)
	}

	logger.Printf("All service account tokens has been updated with new Kubernetes CA certificate")

	for _, ds := range cr.config.daemonSetsToRestart {
		ns, name := ds.Namespace, ds.Name

		logger.Printf("Restarting DaemonSet %s/%s to pick up new Kubernetes CA Certificate", ns, name)

		if err := cr.restartDaemonSetAndWaitToConverge(ns, name); err != nil {
			return fmt.Errorf("restarting DaemonSet %s/%s: %w", ns, name, err)
		}
	}

	for _, deploy := range cr.config.deploymentsToRestart {
		ns, name := deploy.Namespace, deploy.Name

		logger.Printf("Restarting Deployment %s/%s to pick up new Kubernetes CA Certificate", ns, name)

		if err := cr.restartDeploymentAndWaitToConverge(ns, name); err != nil {
			return fmt.Errorf("restarting Deployment %s/%s: %w", ns, name, err)
		}
	}

	return nil
}

// waitForUpdatedServiceAccountTokens polls the cluster until every service
// account token secret contains the new CA certificate.
//
// It pauses between polls so the API server is not hammered by a tight loop,
// and gives up with an error once the timeout elapses so the caller cannot
// hang forever. Any error from the check itself is returned immediately.
func (cr *certificateRotator) waitForUpdatedServiceAccountTokens() error {
	const (
		// pollInterval is the pause between two consecutive checks.
		pollInterval = 5 * time.Second
		// timeout bounds the total time spent waiting.
		timeout = 10 * time.Minute
	)

	deadline := time.Now().Add(timeout)

	for {
		allUpToDate, err := cr.allServiceAccountTokensIncludeNewCA()
		if err != nil {
			return fmt.Errorf("checking if all service account tokens include new CA certificate: %w", err)
		}

		if allUpToDate {
			cr.config.logger.Printf("all service account tokens are up to date and have new CA certificate")

			return nil
		}

		if time.Now().After(deadline) {
			return fmt.Errorf("timed out after %s waiting for service account tokens to include new CA certificate", timeout)
		}

		time.Sleep(pollInterval)
	}
}

// allServiceAccountTokensIncludeNewCA lists the secrets of type
// kubernetes.io/service-account-token in all namespaces and reports whether
// the "ca.crt" entry of every one of them equals the configured new CA
// certificate.
//
// A secret without a "ca.crt" entry compares as an empty string and therefore
// counts as not up to date. With no matching secrets the result is true.
func (cr *certificateRotator) allServiceAccountTokensIncludeNewCA() (bool, error) {
	secrets, err := cr.config.clientSet.CoreV1().Secrets("").List(context.TODO(), metav1.ListOptions{
		FieldSelector: "type=kubernetes.io/service-account-token",
	})
	if err != nil {
		// Wrap with %w so callers can still inspect the underlying error.
		return false, fmt.Errorf("getting secrets: %w", err)
	}

	for i := range secrets.Items {
		// One stale token is enough to answer; no need to scan the rest.
		if string(secrets.Items[i].Data["ca.crt"]) != cr.config.newCACert {
			return false, nil
		}
	}

	return true, nil
}
62 changes: 62 additions & 0 deletions cli/cmd/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package cluster

import (
"encoding/base64"
"fmt"
"path/filepath"

Expand Down Expand Up @@ -332,3 +333,64 @@ func (c controlplaneUpdater) ensureComponent(component, namespace string) error

return nil
}

// taintCertificates runs "terraform taint" for every certificate resource of
// the bootkube module in the existing Terraform state.
//
// Resource addresses are built as
// module.<platform name>-<cluster name>.module.bootkube.<resource>.
func (c *cluster) taintCertificates() error {
	// The module prefix is the same for every resource, so compute it once.
	m := c.platform.Meta()
	prefix := fmt.Sprintf("module.%s-%s.module.bootkube.", m.Name, m.ClusterName)

	targets := []string{
		"tls_locally_signed_cert.admin",
		"tls_locally_signed_cert.admission-webhook-server",
		"tls_locally_signed_cert.aggregation-client[0]",
		"tls_locally_signed_cert.apiserver",
		"tls_locally_signed_cert.client",
		"tls_locally_signed_cert.kubelet",
		"tls_locally_signed_cert.peer",
		"tls_locally_signed_cert.server",
		"tls_self_signed_cert.aggregation-ca[0]",
		"tls_self_signed_cert.etcd-ca",
		"tls_self_signed_cert.kube-ca",
	}

	// One taint step per target; the final length is known up front.
	steps := make([]terraform.ExecutionStep, 0, len(targets))

	for _, t := range targets {
		steps = append(steps, terraform.ExecutionStep{
			Args: []string{"taint", prefix + t},
		})
	}

	if err := c.terraformExecutor.Execute(steps...); err != nil {
		return fmt.Errorf("tainting existing certificates: %w", err)
	}

	return nil
}

// readKubernetesCAFromTerraformOutput reads the "kubernetes_values" Terraform
// output, extracts controllerManager.caCert from it and returns that value
// base64-decoded.
//
// An error is returned when the output cannot be read or parsed, when the
// certificate field is absent or empty, or when it is not valid base64.
func (c *cluster) readKubernetesCAFromTerraformOutput() (string, error) {
	valuesRaw := ""

	if err := c.terraformExecutor.Output("kubernetes_values", &valuesRaw); err != nil {
		return "", fmt.Errorf("getting %q release values from Terraform output: %w", "kubernetes", err)
	}

	// NOTE(review): the fields carry json tags, so this presumably relies on a
	// yaml package that honors them (e.g. sigs.k8s.io/yaml) — confirm against
	// the file's imports.
	values := &struct {
		ControllerManager struct {
			CACert string `json:"caCert"`
		} `json:"controllerManager"`
	}{}

	if err := yaml.Unmarshal([]byte(valuesRaw), values); err != nil {
		return "", fmt.Errorf("parsing %q release values: %w", "kubernetes", err)
	}

	encoded := values.ControllerManager.CACert
	if encoded == "" {
		// Without this check an absent field silently decodes to "".
		return "", fmt.Errorf("CA certificate not found in %q release values", "kubernetes")
	}

	caCert, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return "", fmt.Errorf("decoding base64-encoded CA certificate: %w", err)
	}

	return string(caCert), nil
}
2 changes: 2 additions & 0 deletions examples/aws-testing/cluster.lokocfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ cluster "aws" {
dns_zone_id = var.route53_zone_id
ssh_pubkeys = var.ssh_public_keys

certs_validity_period_hours = 1

//os_channel = "stable"
//os_version = "current"

Expand Down
Loading