From 8b0d0c7137ca150bd8b6e72d5e25e1a9dc37b8eb Mon Sep 17 00:00:00 2001 From: Joe Betz Date: Tue, 5 Dec 2023 17:57:49 -0500 Subject: [PATCH 01/13] Flesh out KEP --- .../README.md | 966 ++++++++++++++++++ .../4355-coordinated-leader-election/kep.yaml | 34 + 2 files changed, 1000 insertions(+) create mode 100644 keps/sig-api-machinery/4355-coordinated-leader-election/README.md create mode 100644 keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md new file mode 100644 index 00000000000..f3bac6a0328 --- /dev/null +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -0,0 +1,966 @@ + +# KEP-4355: Coordinated Leader Election + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Component Identity Leases](#component-identity-leases) + - [Coordinated Election Controller](#coordinated-election-controller) + - [Coordinated Lease Lock](#coordinated-lease-lock) + - [Enabling on a component](#enabling-on-a-component) + - [Comparison of leader election](#comparison-of-leader-election) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) + - [Risk: This breaks leader election in some super subtle way](#risk-this-breaks-leader-election-in-some-super-subtle-way) + - [Risk: How is the election controller elected?](#risk-how-is-the-election-controller-elected) + - [Risk: What if the election controller fails to elect a leader?](#risk-what-if-the-election-controller-fails-to-elect-a-leader) +- [Design Details](#design-details) + - [Running the Coordinated Leader Election in the 
APIServer](#running-the-coordinated-leader-election-in-the-apiserver) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +This proposes a component leader election mechanism that is safer for upgrades +and rollbacks. 
+
+This leader election approach continues to use leases, but with two
+key modifications:
+
+- Instead of a race by component instances to claim the lease, component instances
+  declare candidacy for a lease and an election coordinator claims the lease for the
+  best available candidate. This allows the election coordinator to pick a
+  candidate with the lowest version to ensure that skew rules are not violated.
+- The election coordinator can mark a lease as "end of term" to signal to the
+  current leader to stop renewing the lease. This allows the election
+  coordinator to preempt the current leader and replace it with a better one.
+
+## Motivation
+
+The most common upgrade approach used for Kubernetes control plane components is
+a node-by-node approach where all the components of a control plane node are
+terminated together and then restarted at the new version. This process is
+performed node-by-node across a high availability configuration.
+
+Systems using node-by-node upgrades:
+
+- Cluster API
+- kubeadm
+- KIND
+
+To respect the [Kubernetes skew policy](https://kubernetes.io/releases/version-skew-policy/):
+
+- Upgrades should keep controller managers and schedulers at the *old* version until all apiservers
+  are upgraded.
+- Rollbacks should roll back controller managers and schedulers to the *old* version before any
+  apiservers are rolled back.
+
+But a node-by-node upgrade or rollback does not achieve this today.
+
+- For upgrade, there is about a 25% chance of a new version of the controller
+  running while old versions of the apiserver are active, resulting in a skew
+  violation. (Consider the case where the 2nd node upgraded has the lease)
+- For rollback, it is almost a certainty that skew will be violated.
+
+There is also the possibility that the lease will be lost by a leader during an
+upgrade or rollback resulting in the version of the controller flip-flopping
+between old and new.
+
+### Goals
+
+- Offer an opt-in leader election mechanism to:
+  - Elect the candidate with the oldest version available.
+  - Provide a way to preempt the current leader.
+  - Reuse the existing lease mechanism as much as possible.
+
+### Non-Goals
+
+- Change the default leader election for components.
+
+## Proposal
+
+### Component Identity Leases
+
+Components will create identity leases similar to those used by apiserver identity, e.g.:
+
+```yaml
+apiVersion: coordination.k8s.io/v1
+kind: Lease
+metadata:
+  annotations:
+    coordination.k8s.io/binary-version: "1.29"
+    coordination.k8s.io/can-lead-leases: kube-system/sample-controller
+    coordination.k8s.io/compatibility-version: "1.29"
+  name: sample-controller-0001A
+  namespace: kube-system
+spec:
+  holderIdentity: sample-controller-0001A
+  leaseDurationSeconds: 10
+  renewTime: "2023-12-05T02:33:08.685777Z"
+```
+
+A component identity lease announces candidacy for leadership by including
+`coordination.k8s.io/can-lead-leases` in its identity lease. If the lease
+expires, the component is considered unavailable for leader election purposes.
+
+### Coordinated Election Controller
+
+A Coordinated Election Controller will reconcile component Leader Leases
+(primary resource) and Identity Leases (secondary resource, changes trigger
+reconciliation of related leader leases).
+
+Coordinated Election Controller reconciliation loop:
+
+- If no leader lease exists for a component:
+  - Elect leader from candidates by preparing a freshly renewed lease with:
+    - `spec.holderIdentity` set to the identity of the elected leader
+    - `coordination.k8s.io/elected-by: leader-election-controller` (to make
+      lease types easy to disambiguate)
+- If there is a better candidate than current leader:
+  - Sets `coordination.k8s.io/end-of-term: true` on the leader lease, signaling
+    that the leader should stop renewing the lease and yield leadership
+
+```mermaid
+flowchart TD
+    A[Reconcile] --> |Process Leader Lease| B
+    B{Lease Status?} --> |Better Leader Exists| D
+    B --> |Expired/Missing| E
+    D[End Lease Term]
+    E[Elect Leader]
+```
+
+Example of a lease created by Coordinated Election Controller:
+
+```yaml
+apiVersion: coordination.k8s.io/v1
+kind: Lease
+metadata:
+  annotations:
+    coordination.k8s.io/elected-by: leader-election-controller
+  name: sample-controller
+  namespace: kube-system
+spec:
+  holderIdentity: controller-a
+  leaseDurationSeconds: 10
+  leaseTransitions: 0
+  renewTime: "2023-12-05T18:58:31.295467Z"
+```
+
+The Coordinated Election Controller will run in the kube-apiserver.
+
+The apiserver runs very few controllers, and they are not elected, but instead
+all run concurrently in HA configurations. Because of this, controllers in the
+apiserver must make careful use of concurrency control primitives to ensure
+multiple instances collaborate, not fight. This is discussed in depth in the
+design details below.
+ +### Coordinated Lease Lock + +A new `resourceLock` type of `coordinatedleases`, and `CoordinatedLeaseLock` +implementation of `resourcelock.Interface` will be added to client-go that: + +- Creates Identity Lease when ready to be Leader + - Renews identity lease periodically +- Watches Leader Lease, waiting to be elected leader by the Coordinated Election Controller +- When it becomes leader: + - Perform role of active component instance + - Renew leader lease periodically + - Stop renewing if lease is marked `coordination.k8s.io/end-of-term: true` +- If leader lease expires: + - Shutdown (yielding leadership) and restart as a candidate component instance + +```mermaid +flowchart TD + A[Started] -->|Create Identity Lease| B + B[Candidate] --> |Elected| C[Leader] + C --> |Renew Leader Lease| C + C -->|End of Term / Leader Lease Expired| D[Shutdown] + D[Shutdown] -.-> |Restart| A +``` + +### Enabling on a component + +Components with a `--leader-elect-resource-lock` flag (kube-controller-manager, + kube-scheduler) will accept `coordinatedleases` as a resource lock type. + +### Comparison of leader election + +| | Lease Based Leader Election | Coordinated Leader Election | +| --------------- | -------------------------------- | ------------------------------------------------------------------------------ | +| Lock Type | Lease | Lease | +| Claimed by | Component instance | Election Coordinator. (Lease is claimed for to the elected component instance) | +| Renewed by | Component instance | Component instance | +| Leader Criteria | First component to claim lease | Best leader from available candidates at time of election | +| Preemptable | No | Yes, Collaboratively. (Coordinator marks lease as "end of term". Component instance voluntarily stops renewing) | + +### User Stories (Optional) + +#### Story 1 + +A cluster administrator upgrades a cluster's control plane node-by-node, +expecting version skew to be respected. 
+
+- When the first and second nodes are upgraded, any components that were leaders
+  will typically lose the lease during the node downtime
+  - If one happens to retain its lease, it will be preempted by the coordinated
+    election controller after it updates its identity lease with new version
+    information
+- When the third node is upgraded, all components will be at the new version and one
+  will be elected
+
+#### Story 2
+
+A cluster administrator rolls back a cluster's control plane node-by-node,
+expecting version skew to be respected.
+
+- When the first node is rolled back, any components that were leaders
+  will typically lose the lease during the node downtime
+- Once one of the components updates its identity lease with new version
+  information, the coordinated election controller will preempt the current
+  leader so that this lower version component becomes leader.
+- When the remaining two nodes are rolled back, the first node will typically
+  remain leader, but if a new election occurs, the available older version
+  components will be elected.
+
+### Notes/Constraints/Caveats (Optional)
+
+
+
+### Risks and Mitigations
+
+#### Risk: This breaks leader election in some super subtle way
+
+The goal of this proposal is to minimize this risk by:
+
+- Continuing to renew leases in the same was as before and to never change leaders until a lease expires
+- Fallback to direct lease claiming by components if a leader is not elected
+
+#### Risk: How is the election controller elected?
+
+It's not. It will run directly in the apiserver. The apiserver runs very few controllers, and they are not elected, but instead all run concurrently in HA configurations.
+Requires the election controller to make careful use of concurrency control primitives to ensure multiple instances collaborate, not fight.
+
+#### Risk: What if the election controller fails to elect a leader?
+
+Fallback to letting component instances self elect after a longer delay.
+
+## Design Details
+
+### Running the Coordinated Leader Election in the APIServer
+
+When the Coordinated Leader Election controller runs in the apiserver, it
+is possible that two instances of the controller will have different
+views of the candidate list. This happens when one controller has
+fallen behind on a watch (which can happen for many underlying reasons).
+
+When two controllers have different candidate lists, they might "fight".
+One likely way they would fight is:
+
+- controller A thinks X is the best leader
+- controller B thinks Y is the best leader (because it has stale data from a point in time when this was true)
+- controller A elects X
+- controller B marks the leader lease as "End of term" since it believes Y should be leader
+- controller B elects Y as leader
+- controller A marks the leader lease as "End of term" since it believes X should be leader
+- ...
+
+This can be avoided by tracking resourceVersion or generation numbers of
+resources used to make a decision in the lease being reconciled and authoring
+the controllers not to write to a lease when the data used is stale compared
+to the already tracked resourceVersion or generation numbers.
+
+### Test Plan
+
+
+
+[x] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+ +##### Prerequisite testing updates + + + +##### Unit tests + + + + + +- `staging/src/k8s.io/client-go/tools/leaderelection`: `TODO` - `client-go coordinated leader election tests` +- `pkg/controller/leaderelection`: `TODO` - `new controller tests` +- ``: `` - `` + +##### Integration tests + + + + + +- `test/integration/coordinatedleaderelection`: TODO +- : + +##### e2e tests + + + +- `test/e2e/apimachinery/coordinatedleaderelection.go`: TODO +- : + +### Graduation Criteria + + + +### Upgrade / Downgrade Strategy + +If the `--leader-elect-resource-lock=coordinatedleases` flag is set and a +component is downgraded from beta to alpha, it will need to either remove the +flag or enable the alpha feature. All other upgrades and downgrades are safe. + + + +### Version Skew Strategy + +The feature uses leases in a standard way, so if some components instances are +configured to use the old direct leases and others are configured to use this +enhancement's coordinated leases, the component instances may still safely +share the same lease, and leaders will be safely elected. + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: + - kube-controller-manager + - kube-scheduler +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + +No, even when the feature is enabled, a component must be configured with `--leader-elect-resource-lock=coordinatedleases` to use the feature. + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? 
+
+Yes, the feature uses leases in a standard way, so if some components are configured to
+use direct leases and others are configured to use coordinated leases, elections will
+still happen. Also, coordinated leader election falls back to direct leasing
+if the election coordinator does not elect a leader within a reasonable period of time, making
+it safe to disable this feature in HA clusters.
+
+###### What happens if we reenable the feature if it was previously rolled back?
+
+This is safe. Leader elections would transition back to coordinated leader
+elections. Any elected leaders would continue to renew their leases.
+
+###### Are there any tests for feature enablement/disablement?
+
+Yes, this will be tested, including tests where there is a mix of components
+with the feature enabled and disabled.
+
+
+
+### Rollout, Upgrade and Rollback Planning
+
+
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+
+
+###### What specific metrics should inform a rollback?
+
+
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+
+
+### Monitoring Requirements
+
+
+
+###### How can an operator determine if the feature is in use by workloads?
+
+
+
+###### How can someone using this feature know that it is working for their instance?
+
+
+
+- [ ] Events
+  - Event Reason:
+- [ ] API .status
+  - Condition name:
+  - Other field:
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+ + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? 
+
+## Implementation History
+
+
+
+## Drawbacks
+
+
+
+## Alternatives
+
+
+
+## Infrastructure Needed (Optional)
+
+
diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml b/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml
new file mode 100644
index 00000000000..f7c6fa1cbe2
--- /dev/null
+++ b/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml
@@ -0,0 +1,34 @@
+title: Coordinated Leader Election
+kep-number: 4355
+authors:
+  - "@jpbetz"
+owning-sig: sig-api-machinery
+participating-sigs:
+status: provisional
+creation-date: 2023-12-05
+reviewers:
+  - "@logicalhan"
+  - "@liggitt"
+approvers:
+  - "@deads2k"
+see-also:
+  - "keps/sig-api-machinery/1965-kube-apiserver-identity"
+stage: alpha
+latest-milestone: "v1.30"
+
+# The milestone at which this feature was, or is targeted to be, at each stage.
+milestone:
+  alpha: "v1.30"
+
+# The following PRR answers are required at alpha release
+# List the feature gate name and the components for which it must be enabled
+feature-gates:
+  - name: CoordinatedLeaderElection
+    components:
+      - kube-apiserver
+      - kube-controller-manager
+disable-supported: true
+
+# The following PRR answers are required at beta release
+metrics:
+  - my_feature_metric

From e61905fc005ccdb87d46b5eb6987cb3457201c43 Mon Sep 17 00:00:00 2001
From: Joe Betz
Date: Thu, 7 Dec 2023 00:16:08 -0500
Subject: [PATCH 02/13] Flesh out migrations and alternatives

---
 .../README.md | 142 ++++++++++++++++--
 1 file changed, 131 insertions(+), 11 deletions(-)

diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
index f3bac6a0328..a83f486d164 100644
--- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
+++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
@@ -71,13 +71,16 @@ SIG Architecture for cross-cutting KEPs).
- [Coordinated Election Controller](#coordinated-election-controller) - [Coordinated Lease Lock](#coordinated-lease-lock) - [Enabling on a component](#enabling-on-a-component) + - [Migrations](#migrations) - [Comparison of leader election](#comparison-of-leader-election) - [User Stories (Optional)](#user-stories-optional) - [Story 1](#story-1) - [Story 2](#story-2) - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) - [Risks and Mitigations](#risks-and-mitigations) - - [Risk: This breaks leader election in some super subtle way](#risk-this-breaks-leader-election-in-some-super-subtle-way) + - [Risk: Amount of writes performed by leader election increases substantially](#risk-amount-of-writes-performed-by-leader-election-increases-substantially) + - [Risk: leases watches increases apiserver load substantially](#risk-leases-watches-increases-apiserver-load-substantially) + - [Risk: We have to "start over" and build confidence in a new leader election algorithm](#risk-we-have-to-start-over-and-build-confidence-in-a-new-leader-election-algorithm) - [Risk: How is the election controller elected?](#risk-how-is-the-election-controller-elected) - [Risk: What if the election controller fails to elect a leader?](#risk-what-if-the-election-controller-fails-to-elect-a-leader) - [Design Details](#design-details) @@ -100,6 +103,8 @@ SIG Architecture for cross-cutting KEPs). - [Implementation History](#implementation-history) - [Drawbacks](#drawbacks) - [Alternatives](#alternatives) + - [Component instances pick a leader without a coordinator](#component-instances-pick-a-leader-without-a-coordinator) +- [Future Work](#future-work) - [Infrastructure Needed (Optional)](#infrastructure-needed-optional) @@ -309,6 +314,36 @@ flowchart TD Components with a `--leader-elect-resource-lock` flag (kube-controller-manager, kube-scheduler) will accept `coordinatedleases` as a resource lock type. 
+### Migrations + +So long as the API server is running a coordinated election controller, it is +safe to directly migrate a component from Lease Based Leader Election to +Coordinated Leader Election (or vis-versa). + +During the upgrade, a mix of components will be running both election +approaches. When the leader lease expires, there are a couple possibilities: + +- A controller instance using Lease Based Leader Election claims the leader lease +- The coordinated election controller picks a leader, from the components that + have written identity leases, and claims the lease on the leader's behalf + +Both possibilities have acceptable outcomes during the migration-- a component +is elected leader, and once elected, remains leader so long as it keeps the +lease renewed. The elected leader might not be the leader that Coordinated +Leader Election would pick, but this is no worse than how leader election works +before the upgrade, and once the upgrade is complete, Coordinated Leader +Election works as intended. + +There is one thing that could make migrations slightly cleaner: If Coordinated +Leader Election adds a +`coordination.k8s.io/elected-by: leader-election-controller` annotation to any +leases that it claims. It can also check for this annotation and only mark +leases as "end-of-term" if that annotation is present. Lease Based Leader Election +would ignore "end-of-term" annotations anyway, so this isn't strictly needed, +but it would reduce writes from the coordinated election controller to leases that +were claimed by component instances not using Coordinated Leader Election + + ### Comparison of leader election | | Lease Based Leader Election | Coordinated Leader Election | @@ -359,12 +394,45 @@ This might be a good place to talk about core concepts and how they relate. 
 
 ### Risks and Mitigations
 
-#### Risk: This breaks leader election in some super subtle way
+#### Risk: Amount of writes performed by leader election increases substantially
+
+This enhancement introduces an identity lease for each instance of each component.
+
+Example:
+
+- HA cluster with 3 control plane nodes
+- 3 elected components (kube-controller-manager, scheduler, cloud-controller-manager) per control plane node
+- 9 identity leases are created and renewed by the components
+
+Introducing this feature is roughly equivalent to adding the same lease load
+as adding 9 nodes to a Kubernetes cluster.
+
+The [API Server Identity enhancement](../1965-kube-apiserver-identity) also
+introduces similar leases. For comparison, in an HA cluster with 3 control plane
+nodes, API Server Identity adds 3 leases.
+
+This risk can be mitigated by scale testing and, if needed, extending the lease
+duration and renewal times to reduce writes/s.
+
+#### Risk: leases watches increases apiserver load substantially
+
+The [Unknown Version Interoperability Proxy (UVIP) enhancement](../4020-unknown-version-interoperability-proxy) also adds
+lease watches on [API Server Identity](../1965-kube-apiserver-identity) leases in the kube-system namespace.
+This enhancement would increase the expected number of resources being watched from ~3 (for UVIP) to ~12.
+
+#### Risk: We have to "start over" and build confidence in a new leader election algorithm
+
+We've built confidence in the existing leasing algorithm, through an investment
+of engineering effort, and in core hours testing it and running it in production.
 
-The goal of this proposal is to minimize this risk by:
+Changing the algorithm "resets the clock" and forces us to rebuild confidence in
+the new algorithm.
-- Continuing to renew leases in the same was as before and to never change leaders until a lease expires -- Fallback to direct lease claiming by components if a leader is not elected +The goal of this proposal is to minimize this risk by reusing as much of the +existing lease algorithm as possible: + +- Renew leases in exactly the same way as before +- Leases can never be claimed by another leader until a lease expires #### Risk: How is the election controller elected? @@ -373,7 +441,9 @@ Requires the election controller make careful use concurrency control primitives #### Risk: What if the election controller fails to elect a leader? -Fallback to letting component instances self elect after a longer delay. +Fallback to letting component instances claim the lease directly, after a longer +delay, to give the coordinated election controller an opportunity to elect +before resorting to the fallback. ## Design Details @@ -951,11 +1021,61 @@ Why should this KEP _not_ be implemented? ## Alternatives - +### Component instances pick a leader without a coordinator + +The idea of this alternative is to elect a leader from a set of candidates +without a coordinated election controller. + +Some rough ideas of how this might be done: + +1. A candidates is picked at random to be an election coordinator, and the + coordinator picks the leader: + - Components race to claim the lease + - If a component claims the lease, the first thing it does is check to see if there is a better leader + - If it finds a better lease, it assigns the lease to that component instead of itself + +Pros: + + - No coordinated election controller +Cons: + + - All component instances must watch the identity leases + - All components must have the code to decide which component is the best leader + +2. The candidates agree on the leader collectively + - Leases have "Election" and "Term" states + - Leases are first created in the "election" state. 
+ - While in the "election" state, candidates self-nominate by updating the lease + with their identity and version information. Candidates only need to + self nominate if they are a better candidate than candidate information + already written to the lease. + - When "Election" timeout expires, the best candidate becomes the leader + - The leader sets the state to "term" and starts renewing the lease + - If the lease expires, it goes back to the "election" state + +Pros: + +- No coordinated election controller +- No identity leases + +Cons: + +- Complex election algorithm is distributed as a client-go library. A bug in the + algorithm cannot not be fixed by only upgrading kubernetes.. all controllers + in the ecosystem with the bug must upgrade client-go and release to be fixed. +- More difficult to change/customize the criteria for which candidate is best. + +If we decide in the future to shard controllers and wish to leverage coordinated +eader election to balance shards, it's much easier introduce the change in a +controller in the apiserver than in client-go library code distributed to +elected controllers. + +## Future Work + +- Controller sharding could leverage coordinated leader election to load balance + controllers against apiservers. +- Optimizations for graceful and performant failover can be built on this + enhancement. 
 
 ## Infrastructure Needed (Optional)
 

From f653f8e00a30d72afe1275bff2fa5dd223b7fbc1 Mon Sep 17 00:00:00 2001
From: Joe Betz
Date: Thu, 7 Dec 2023 09:31:53 -0500
Subject: [PATCH 03/13] organize alternatives

---
 .../README.md | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
index a83f486d164..8081d09d862 100644
--- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
+++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md
@@ -104,6 +104,7 @@ SIG Architecture for cross-cutting KEPs).
 - [Drawbacks](#drawbacks)
 - [Alternatives](#alternatives)
   - [Component instances pick a leader without a coordinator](#component-instances-pick-a-leader-without-a-coordinator)
+  - [Component instances pick a leader without identity leases or a coordinator](#component-instances-pick-a-leader-without-identity-leases-or-a-coordinator)
 - [Future Work](#future-work)
 - [Infrastructure Needed (Optional)](#infrastructure-needed-optional)
 
@@ -1021,28 +1022,38 @@ Why should this KEP _not_ be implemented?
 
 ## Alternatives
 
-
+When evaluating alternatives, note that if we decide in the future to improve
+the algorithm, fix a bug in the algorithm, or change the criteria for how
+leaders are elected, our decision on where to put the code has a huge impact on
+how the change is rolled out.
+
+For example, it will be much easier to make the change in a controller in the kube-apiserver
+than in client-go library code distributed to elected controllers, because once
+it is distributed into controllers, especially 3rd party controllers, any change
+requires updating client-go and then updating all controllers to that version of
+client-go.
-Some rough ideas of how this might be done: +### Component instances pick a leader without a coordinator -1. A candidates is picked at random to be an election coordinator, and the +- A candidates is picked at random to be an election coordinator, and the coordinator picks the leader: - Components race to claim the lease - - If a component claims the lease, the first thing it does is check to see if there is a better leader - - If it finds a better lease, it assigns the lease to that component instead of itself + - If a component claims the lease, the first thing it does is check the + identity leases to see if there is a better leader + - If it finds a better lease, it assigns the lease to that component instead + of itself Pros: - - No coordinated election controller + Cons: - All component instances must watch the identity leases - All components must have the code to decide which component is the best leader -2. The candidates agree on the leader collectively +### Component instances pick a leader without identity leases or a coordinator + +- The candidates communicate through the lease to agree on the leader - Leases have "Election" and "Term" states - Leases are first created in the "election" state. - While in the "election" state, candidates self-nominate by updating the lease @@ -1065,11 +1076,6 @@ Cons: in the ecosystem with the bug must upgrade client-go and release to be fixed. - More difficult to change/customize the criteria for which candidate is best. -If we decide in the future to shard controllers and wish to leverage coordinated -eader election to balance shards, it's much easier introduce the change in a -controller in the apiserver than in client-go library code distributed to -elected controllers. 
- ## Future Work - Controller sharding could leverage coordinated leader election to load balance From 88de5196e823f6391b8823c9ed8b36682b39248b Mon Sep 17 00:00:00 2001 From: Joe Betz Date: Fri, 8 Dec 2023 11:02:22 -0500 Subject: [PATCH 04/13] improve goals --- .../README.md | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index 8081d09d862..cd45229cd5f 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -200,10 +200,20 @@ between old and new. ### Goals -- Offer an opt-in leader election mechanism to: - - Elect the candidate with the oldest version available. - - Provide a way to preempt the current leader. - - Reuse the existing lease mechanism as much as possible. +During HA upgrades/rollbacks/downgrades, + +Leader elected components: + +- Change versions at predictable times +- Do not violate version skew, even during node-by-node rollbacks + +The control plane: + +- Can safely canary components and nodes at the new version for an extended + period of time, or to pause an upgrade at any step durning an upgrade. This + enhancement, combined with + [UVIP](../4020-unknown-version-interoperability-proxy) helps achieve this. + ### Non-Goals @@ -211,6 +221,11 @@ between old and new. ## Proposal +- Offer an opt-in leader election mechanism to: + - Elect the candidate with the oldest version available. + - Provide a way to preempt the current leader. + - Reuse the existing lease mechanism as much as possible. 
+ ### Component Identity Leases Components will create identity leases similar to those used by apiserver identity, e.g.: From 529c9d675c5e2214a9320f80a1dbbdcb77f1a288 Mon Sep 17 00:00:00 2001 From: Joe Betz Date: Thu, 11 Jan 2024 12:57:39 -0800 Subject: [PATCH 05/13] Apply suggestions from code review Co-authored-by: Lubomir I. Ivanov --- .../4355-coordinated-leader-election/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index cd45229cd5f..b3b4afdfe93 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -252,7 +252,7 @@ expires, the component is considered unavailable for leader election purposes. ### Coordinated Election Controller -A Coordinated Election Controller will reconcile component Leader Leases +A new Coordinated Election Controller will reconcile component Leader Leases (primary resource) and Identity Leases (secondary resource, changes trigger reconciliation of related leader leases). @@ -378,9 +378,9 @@ A cluster administrator upgrades a cluster's control plane node-by-node, expecting version skew to be respected. 
- When the first and second nodes are upgraded, any components that were leaders - will typically loose the lease during the node downtime - - If one happens to retain it's lease, it will be preempted by the coordinated - election controller after it updates it's identity lease with new version + will typically lose the lease during the node downtime + - If one happens to retain its lease, it will be preempted by the coordinated + election controller after it updates its identity lease with new version information - When the third node is upgraded, all components will be at the new version and one will be elected @@ -430,7 +430,7 @@ nodes, API Server Identity adds 3 leases. This risk can be migitated by scale testing and, if needed, extending the lease duration and renewal times to reduce writes/s. -#### Risk: leases watches increases apiserver load substantially +#### Risk: lease watches increase apiserver load substantially The [Unknown Version Interoperability Proxy (UVIP) enhancement](../4020-unknown-version-interoperability-proxy) also adds lease watches on [API Server Identity](../1965-kube-apiserver-identity) leases in the kube-system namespace. From 22772990e95b6fc2c9f78f6218ae9a1e00cd4aeb Mon Sep 17 00:00:00 2001 From: Joe Betz Date: Thu, 11 Jan 2024 16:02:51 -0500 Subject: [PATCH 06/13] Apply feedback --- .../4355-coordinated-leader-election/README.md | 10 +++++----- .../4355-coordinated-leader-election/kep.yaml | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index b3b4afdfe93..e4ef6c92912 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -189,7 +189,7 @@ To respect the [Kubernetes skew policy](https://kubernetes.io/releases/version-s But a node-by-node upgrade or rollback does not achieve this today. 
-- For upgrade, there is about a 25% chance of a new version of the controller +- For 3 node control plane upgrade, there is about a 25% chance of a new version of the controller running while old versions of the apiserver are active, resulting in a skew violation. (Consider case where the 2nd node upgraded has the lease) - For rollback, it is almost a certainty that skew will be violated. @@ -236,12 +236,12 @@ kind: Lease metadata: annotations: coordination.k8s.io/binary-version: "1.29" - coordination.k8s.io/can-lead-leases: kube-system/sample-controller + coordination.k8s.io/can-lead-leases: kube-system/some-custom-controller coordination.k8s.io/compatibility-version: "1.29" - name: sample-controller-0001A + name: some-custom-controller-0001A namespace: kube-system spec: - holderIdentity: sample-controller-0001A + holderIdentity: some-custom-controller-0001A leaseDurationSeconds: 10 renewTime: "2023-12-05T02:33:08.685777Z" ``` @@ -284,7 +284,7 @@ kind: Lease metadata: annotations: coordination.k8s.io/elected-by: coordinated-election-controller - name: sample-controller + name: some-custom-controller namespace: kube-system spec: holderIdentity: controller-a diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml b/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml index f7c6fa1cbe2..ec7be8f00bb 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/kep.yaml @@ -4,6 +4,7 @@ authors: - "@jpbetz" owning-sig: sig-api-machinery participating-sigs: + - sig-cluster-lifecycle status: provisional creation-date: 2023-14-05 reviewers: From 28e2e7514e9ee36e8cbd88f8d554a79ce45686a3 Mon Sep 17 00:00:00 2001 From: Joe Betz Date: Mon, 29 Jan 2024 12:42:13 -0500 Subject: [PATCH 07/13] Update TOC --- .../4355-coordinated-leader-election/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index e4ef6c92912..e6ea299d7d3 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -79,7 +79,7 @@ SIG Architecture for cross-cutting KEPs). - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) - [Risks and Mitigations](#risks-and-mitigations) - [Risk: Amount of writes performed by leader election increases substantially](#risk-amount-of-writes-performed-by-leader-election-increases-substantially) - - [Risk: leases watches increases apiserver load substantially](#risk-leases-watches-increases-apiserver-load-substantially) + - [Risk: lease watches increase apiserver load substantially](#risk-lease-watches-increase-apiserver-load-substantially) - [Risk: We have to "start over" and build confidence in a new leader election algorithm](#risk-we-have-to-start-over-and-build-confidence-in-a-new-leader-election-algorithm) - [Risk: How is the election controller elected?](#risk-how-is-the-election-controller-elected) - [Risk: What if the election controller fails to elect a leader?](#risk-what-if-the-election-controller-fails-to-elect-a-leader) From 1176c9528961900dfd72689b80089d153a26c749 Mon Sep 17 00:00:00 2001 From: Jefftree Date: Thu, 1 Feb 2024 11:48:17 -0500 Subject: [PATCH 08/13] Add changes to coordinated leader election --- .../README.md | 264 +++++++++++------- .../4355-coordinated-leader-election/kep.yaml | 1 + 2 files changed, 169 insertions(+), 96 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index e6ea299d7d3..706e225626b 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -84,7 +84,6 @@ SIG 
Architecture for cross-cutting KEPs). - [Risk: How is the election controller elected?](#risk-how-is-the-election-controller-elected) - [Risk: What if the election controller fails to elect a leader?](#risk-what-if-the-election-controller-fails-to-elect-a-leader) - [Design Details](#design-details) - - [Running the Coordinated Leader Election in the APIServer](#running-the-coordinated-leader-election-in-the-apiserver) - [Test Plan](#test-plan) - [Prerequisite testing updates](#prerequisite-testing-updates) - [Unit tests](#unit-tests) @@ -103,6 +102,10 @@ SIG Architecture for cross-cutting KEPs). - [Implementation History](#implementation-history) - [Drawbacks](#drawbacks) - [Alternatives](#alternatives) + - [Similar approaches involving the leader election controller](#similar-approaches-involving-the-leader-election-controller) + - [Running the leader election controller in HA on every apiserver](#running-the-leader-election-controller-in-ha-on-every-apiserver) + - [Running the coordinated leader election controller in KCM](#running-the-coordinated-leader-election-controller-in-kcm) + - [Running the coordinated leader election controller in a new container](#running-the-coordinated-leader-election-controller-in-a-new-container) - [Component instances pick a leader without a coordinator](#component-instances-pick-a-leader-without-a-coordinator) - [Component instances pick a leader without identity leases or a coordinator](#component-instances-pick-a-leader-without-identity-leases-or-a-coordinator) - [Future Work](#future-work) @@ -125,22 +128,31 @@ Check these off as they are completed for the Release Team to track. These checklist items _must_ be updated for the enhancement to be released. --> -Items marked with (R) are required *prior to targeting to a milestone / release*. +Items marked with (R) are required *prior to targeting to a milestone / +release*. 
-- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in + [kubernetes/enhancements] (not the initial KEP PR) - [ ] (R) KEP approvers have approved the KEP status as `implementable` - [ ] (R) Design details are appropriately documented -- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and + SIG Testing input (including test refactors) - [ ] e2e Tests for all Beta API Operations (endpoints) - - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance + Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free - [ ] (R) Graduation criteria is in place - - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) [all GA + Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by + [Conformance + Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) - [ ] (R) Production readiness review completed - [ ] (R) Production readiness review approved - [ ] "Implementation History" section is up-to-date for milestone -- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] -- [ ] Supporting documentation—e.g., additional design documents, links to mailing list 
discussions/SIG meetings, relevant PRs/issues, release notes +- [ ] User-facing documentation has been created in [kubernetes/website], for + publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to + mailing list discussions/SIG meetings, relevant PRs/issues, release notes [x] I/we understand the owners of the involved components may require updates to -existing tests to make this code solid enough prior to committing the changes necessary -to implement this enhancement. +existing tests to make this code solid enough prior to committing the changes +necessary to implement this enhancement. ##### Prerequisite testing updates @@ -531,7 +538,8 @@ This can inform certain test coverage improvements that we want to do before extending the production code to implement this enhancement. --> -- `staging/src/k8s.io/client-go/tools/leaderelection`: `TODO` - `client-go coordinated leader election tests` +- `staging/src/k8s.io/client-go/tools/leaderelection`: `TODO` - `client-go + coordinated leader election tests` - `pkg/controller/leaderelection`: `TODO` - `new controller tests` - ``: `` - `` @@ -656,8 +664,8 @@ enhancement: The feature uses leases in a standard way, so if some components instances are configured to use the old direct leases and others are configured to use this -enhancement's coordinated leases, the component instances may still safely -share the same lease, and leaders will be safely elected. +enhancement's coordinated leases, the component instances may still safely share +the same lease, and leaders will be safely elected. @@ -131,11 +141,11 @@ checklist items _must_ be updated for the enhancement to be released. Items marked with (R) are required *prior to targeting to a milestone / release*. 
-- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in +- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) -- [ ] (R) KEP approvers have approved the KEP status as `implementable` -- [ ] (R) Design details are appropriately documented -- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and +- [x] (R) KEP approvers have approved the KEP status as `implementable` +- [x] (R) Design details are appropriately documented +- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) - [ ] e2e Tests for all Beta API Operations (endpoints) - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance @@ -146,8 +156,8 @@ release*. Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) -- [ ] (R) Production readiness review completed -- [ ] (R) Production readiness review approved +- [x] (R) Production readiness review completed +- [x] (R) Production readiness review approved - [ ] "Implementation History" section is up-to-date for milestone - [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] @@ -225,7 +235,7 @@ Leader elected components: The control plane: - Can safely canary components and nodes at the new version for an extended - period of time, or to pause an upgrade at any step durning an upgrade. This + period of time, or to pause an upgrade at any step during an upgrade. This enhancement, combined with [UVIP](../4020-unknown-version-interoperability-proxy) helps achieve this. @@ -238,49 +248,51 @@ The control plane: - Offer an opt-in leader election mechanism to: - Elect the candidate with the oldest version available. - - Provide a way to preempt the current leader. 
+ - Provide a way to preempt the current leader on the upcoming expiry of the term. - Reuse the existing lease mechanism as much as possible. -### Component Identity Leases +### Component Lease Candidates -Components will create identity leases similar to those used by apiserver -identity, e.g.: +Components will create lease candidates similar to those used by apiserver +identity. Some key differences are certain fields like `LeaseTransitions` and `HolderIdentity` are removed. +See the API section for the full API. + + e.g.: ```yaml apiVersion: coordination.k8s.io/v1 -kind: Lease +kind: LeaseCandidate metadata: - annotations: - coordination.k8s.io/binary-version: "1.29" - coordination.k8s.io/can-lead-leases: kube-system/some-custom-controller - coordination.k8s.io/compatibility-version: "1.29" + labels: + binary-version: "1.29" + compatibility-version: "1.29" name: some-custom-controller-0001A namespace: kube-system spec: - holderIdentity: some-custom-controller-0001A - leaseDurationSeconds: 10 + canLeadLease: kube-system/some-custom-controller + leaseDurationSeconds: 300 renewTime: "2023-12-05T02:33:08.685777Z" ``` -A component identity lease announces candidacy for leadership by including -`coordination.k8s.io/can-lead-leases` it is identity lease. If the lease -expires, the component is considered unavailable for leader election purposes. +A component "lease candidate" announces candidacy for leadership by specifying +`spec.canLeadLease` in its lease candidate lease. If the LeaseCandidate object expires, the +component is considered unavailable for leader election purposes. "Expires" is defined more clearly in the Renewal Interval section. 
### Coordinated Election Controller -A new Coordinated Election Controller will reconcile component Leader Leases -(primary resource) and Identity Leases (secondary resource, changes trigger +A new Coordinated Election Controller will reconcile component leader `Lease`s +(primary resource) and Lease Candidate Leases (secondary resource, changes trigger reconciliation of related leader leases). Coordinated Election Controller reconciliation loop: - If no leader lease exists for a components: - - Elect leader from candidates by preparing a freshly renewed lease with: + - Elect leader from candidates by preparing a freshly renewed `Lease` with: - `spec.holderIdentity` set to the identity of the elected leader - `coordination.k8s.io/elected-by: leader-election-controller` (to make lease types easy to disambiguate) - If there is a better candidate than current leader: - - Sets `coordination.k8s.io/end-of-term: true` on the leader lease, signaling + - Sets `endofterm: true` on the leader `Lease`, signaling that the leader should stop renewing the lease and yield leadership ```mermaid @@ -326,26 +338,98 @@ option. A new `resourceLock` type of `coordinatedleases`, and `CoordinatedLeaseLock` implementation of `resourcelock.Interface` will be added to client-go that: -- Creates Identity Lease when ready to be Leader - Renews identity lease - periodically +- Creates LeaseCandidate Lease when ready to be Leader +- Renews LeaseCandidate lease infrequently (once every 300 seconds) +- Watches its LeaseCandidate lease for the `coordination.k8s.io/pending-ack` annotation and updates to remove it. When the annotation is removed, the `renewTime` is subsequently updated. 
+ - Watches Leader Lease, waiting to be elected leader by the Coordinated Election Controller - When it becomes leader: - Perform role of active component instance - Renew leader lease periodically - - Stop renewing if lease is marked `coordination.k8s.io/end-of-term: true` + - Stop renewing if lease is marked `spec.endOfTerm: true` - If leader lease expires: - Shutdown (yielding leadership) and restart as a candidate component instance ```mermaid flowchart TD - A[Started] -->|Create Identity Lease| B + A[Started] -->|Create LeaseCandidate Lease| B B[Candidate] --> |Elected| C[Leader] C --> |Renew Leader Lease| C C -->|End of Term / Leader Lease Expired| D[Shutdown] D[Shutdown] -.-> |Restart| A ``` +### Renewal Interval and Performance +The leader lease will have renewal interval and duration (2s and 15s). This is similar to the renewal interval of the current leader lease. + +For component leases, keeping a short renewal interval will add many unnecessary writes to the apiserver. +The component leases renewal interval will default to 5 mins. + +When the leader lease is marked as end of term or available, the coordinated leader election controller will +add an annotation to all component lease candidate objects (`coordination.k8s.io/pending-ack`) and wait up to 5 seconds. +During that time, components must update their component lease to remove the annotation. +The leader election controller will then pick the leader based on its criteria from the set of component leases that have ack'd the request. + +### Strategy + +There are cases where a user may want to change the leader election algorithm +and this can be done via the `spec.Strategy` field in a Lease. + +The `Strategy` field signals to the coordinated leader election controller the +appropriate algorithm to use when selecting leaders. + +We will allow the Coordinated Leader Election controller to create a Lease +without a holder. The `Lease` may be updated by a third party to the desired +`spec.Strategy`. 
The strategy will always default to +`MinimumCompatibilityVersion`. + +#### Alternative for Strategy + +##### Creating a new LeaseConfiguration resource + +We can create a new resource `LeaseConfiguration` to set up the defaults for +`Strategy` and other configurations extensible in the future. This is a very +clean approach that allows users to change the strategy at will without needing +to recompile/restart anything. The main drawback is the introduction of a new +resource and more complexity in leader election logic and watching. + +```yaml +kind: LeaseConfiguration +spec: + targetLease: "kube-system/kube-controller-manager" + strategy: "MinimumCompatibilityVersion" +``` + +##### YAML/CLI configuration on the kube-apiserver + +We can also populate the default by directly setting up the CLE controller to ingest the proper defaults. +For instance, ingesting a YAML configuration in the form of a list of KV pairs of `lease:strategy` pairs will allow the CLE controller to directly determine the `Strategy` used for each component. This has the added benefit of requiring no API changes as it is optional whether to include the strategy in the `Lease` object. + +The drawback of this method is that elevated permissions are needed to configure the kube-apiserver. In addition, an apiserver restart may be needed when the `Strategy` needs to be changed. + +##### Strategy propagated from LeaseCandidate + +One other alternative is that Strategy could be an option specified by a +`LeaseCandidate` object, in most cases the controller responsible renewing the +`LeaseCandidate` lease. The value for the strategy between different +`LeaseCandidate` objects leading the same `Lease` should be the same, but during +mixed version states, there is a possibility that they may differ. We will use a +consensus protocol that favors the algorithm with the highest priority. The +priority is a fixed list that is predetermined. For now, this is +`NoCoordination` > `MinimumCompatibilityVersion`. 
For example, if three +`LeaseCandidate` objects exist and two objects select +`MinimumCompatibilityVersion` while the third selects `NoCoordination`, +`NoCoordination` will take precedence and the coordinated leader election +controller will use `NoCoordination` as the election strategy. The final +strategy used will be written to the `Lease` object when the CLE controller +creates the `Lease` for a suitable leader. This has the benefit of providing +better debugging information and allows short circuiting of an election if the +set of candidates and selected strategy is the same as before. + +The obvious drawback is the need for a consensus protocol and extra information +in the `LeaseCandidate` object that may be unnecessary. + ### Enabling on a component Components with a `--leader-elect-resource-lock` flag (kube-controller-manager, @@ -360,12 +444,12 @@ Coordinated Leader Election (or vis-versa). During the upgrade, a mix of components will be running both election approaches. When the leader lease expires, there are a couple possibilities: -- A controller instance using Lease Based Leader Election claims the leader +- A controller instance using `Lease`-based leader election claims the leader lease - The coordinated election controller picks a leader, from the components that - have written identity leases, and claims the lease on the leader's behalf + have written LeaseCandidate leases, and claims the lease on the leader's behalf -Both possibilities have acceptable outcomes during the migration-- a component +Both possibilities have acceptable outcomes during the migration: a component is elected leader, and once elected, remains leader so long as it keeps the lease renewed. 
The elected leader might not be the leader that Coordinated Leader Election would pick, but this is no worse than how leader election works @@ -381,6 +465,88 @@ annotations anyway, so this isn't strictly needed, but it would reduce writes from the coordinated election controller to leases that were claimed by component instances not using Coordinated Leader Election +### API + +The lease lock API will be extended with a new field for election preference, denoted as an enum for strategies for Coordinated Leader Election. + +```go + +type CoordinatedLeaseStrategy string + +// CoordinatedLeaseStrategy defines the strategy for picking the leader for coordinated leader election. +const ( + OldestCompatibilityVersion CoordinatedStrategy = "OldestCompatibilityVersion" + NoCoordination CoordinatedStrategy = "NoCoordination" +) + +type LeaseSpec struct { + // Strategy indicates the strategy for picking the leader for coordinated leader election + // This is filled in from LeaseCandidate.Spec.Strategy or defaulted to NoCoordinationStrategy + // if the leader was not elected by the CLE controller. + Strategy CoordinatedLeaseStrategy `json:"strategy,omitempty" protobuf:"string,6,opt,name=strategy"` + + // EndofTerm signals to a lease holder that the lease should not be + // renewed because a better candidate is available. + EndOfTerm bool `json:"endOfTerm,omitempty" protobuf:"boolean,7,opt,name=endOfTerm"` + + // EXISTING FIELDS BELOW + + // holderIdentity contains the identity of the holder of a current lease. + // +optional + HolderIdentity *string `json:"holderIdentity,omitempty" protobuf:"bytes,1,opt,name=holderIdentity"` + // leaseDurationSeconds is a duration that candidates for a lease need + // to wait to force acquire it. This is measure against time of last + // observed renewTime. 
+ // +optional + LeaseDurationSeconds *int32 `json:"leaseDurationSeconds,omitempty" protobuf:"varint,2,opt,name=leaseDurationSeconds"` + // acquireTime is a time when the current lease was acquired. + // +optional + AcquireTime *metav1.MicroTime `json:"acquireTime,omitempty" protobuf:"bytes,3,opt,name=acquireTime"` + // renewTime is a time when the current holder of a lease has last + // updated the lease. + // +optional + RenewTime *metav1.MicroTime `json:"renewTime,omitempty" protobuf:"bytes,4,opt,name=renewTime"` + // leaseTransitions is the number of transitions of a lease between + // holders. + // +optional + LeaseTransitions *int32 `json:"leaseTransitions,omitempty" protobuf:"varint,5,opt,name=leaseTransitions"` +} +``` + +For the LeaseCandidate leases, a new lease will be created + +```go +type LeaseCandidateSpec struct { + // The fields BinaryVersion and CompatibilityVersion will be mandatory labels instead of fields in the spec + + // CanLeadLease is in the format /, indicating the namespace and name of the lease that the candidate may lead + CanLeadLease string + + // Strategy indicates the preferred strategy for the coordinated leader election controller to use. + Strategy CoordinatedLeaseStrategy `json:"strategy,omitempty" protobuf:"string,6,opt,name=strategy"` + + // FIELDS DUPLICATED FROM LEASE + + // leaseDurationSeconds is a duration that candidates for a lease need + // to wait to force acquire it. This is measure against time of last + // observed renewTime. + // +optional + LeaseDurationSeconds *int32 `json:"leaseDurationSeconds,omitempty" protobuf:"varint,2,opt,name=leaseDurationSeconds"` + // renewTime is a time when the current holder of a lease has last + // updated the lease. + // +optional + RenewTime *metav1.MicroTime `json:"renewTime,omitempty" protobuf:"bytes,4,opt,name=renewTime"` +} +``` + +Each LeaseCandidate lease may only lead one lock. 
If the same component wishes to lead many leases, +a separate LeaseCandidate lease will be required for each lock. + +If the `LeaseCandidate` objects do not agree on a value for the Strategy, we will have an ordering priority. +For instance, we define `NewestCompatibilityVersion` > `OldestCompatibilityVersion`. This means that if +a subset of candidates have `OldestCompatibilityVersion` and a subset have `NewestCompatibilityVersion`, +coordinated leader election will pick `NewestCompatibilityVersion`. In order for `OldestCompatibilityVersion` to be used, +all `LeaseCandidate` objects must publish the same `Strategy`. ### Comparison of leader election @@ -402,7 +568,7 @@ expecting version skew to be respected. - When the first and second nodes are upgraded, any components that were leaders will typically lose the lease during the node downtime - If one happens to retain its lease, it will be preempted by the coordinated - election controller after it updates its identity lease with new version + election controller after it updates its LeaseCandidate lease with new version information - When the third node is upgraded, all components will be at the new version and one will be elected @@ -414,13 +580,21 @@ expecting version skew to be respected. - When the first node is rolled back, any components that were leaders will typically loose the lease during the node downtime -Once one of the components updates its identity lease with new version +Once one of the components updates its LeaseCandidate lease with new version information, the coordinated election controller will preempt the current leader so that this lower version component becomes leader. - When the remaining two nodes can rollback, the first node will typically remain leader, but if a new election occurs, the available older version components will be elected. +#### Story 3 + +A cluster administrator may want more fine-grained control over a control plane's upgrade. 
+ +- When one node is upgraded they may wish to canary the components on that + node and switch the leader to the new compatibility version immediately. +- This can be accomplished by changing the `Strategy` field in a lease object. + ### Notes/Constraints/Caveats (Optional) -- `staging/src/k8s.io/client-go/tools/leaderelection`: `TODO` - `client-go - coordinated leader election tests` +- `staging/src/k8s.io/client-go/tools/leaderelection`: 76.8 - `pkg/controller/leaderelection`: `TODO` - `new controller tests` -- ``: `` - `` ##### Integration tests @@ -560,8 +732,7 @@ For Beta and GA, add links to added tests together with links to k8s-triage for https://storage.googleapis.com/k8s-triage/index.html --> -- `test/integration/coordinatedleaderelection`: TODO -- : +- `test/integration/apiserver/coordinatedleaderelection`: New file ##### e2e tests @@ -575,8 +746,7 @@ https://storage.googleapis.com/k8s-triage/index.html We expect no non-infra related flakes in the last month as a GA graduation criteria. --> -- `test/e2e/apimachinery/coordinatedleaderelection.go`: TODO -- : +- `test/e2e/apimachinery/coordinatedleaderelection.go`: New file ### Graduation Criteria @@ -642,6 +812,10 @@ in back-to-back releases. - Deprecate the flag --> +#### Alpha +- Feature implemented behind a feature flag +- The strategy `MinimumCompatibilityVersionStrategy` is implemented + ### Upgrade / Downgrade Strategy If the `--leader-elect-resource-lock=coordinatedleases` flag is set and a @@ -723,8 +897,9 @@ well as the [existing list] of feature gates. --> - [x] Feature gate (also fill in values in `kep.yaml`) - - Feature gate name: + - Feature gate name: CoordinatedLeaderElection - Components depending on the feature gate: + - kube-apiserver - kube-controller-manager - kube-scheduler - [ ] Other @@ -1116,8 +1291,8 @@ to avoid. Instead of running in KCM, the coordinated leader election controller could be run in a new container (eg: `kube-coordinated-leader-election`). 
There will be a -large memory footprint with this approach and adding a new component to the -control plane could change our Kubernetes architecture in an undesirable way. +slightly larger memory footprint with this approach and adding a new component to the +control plane changes our Kubernetes control plane topology in an undesirable way. ### Component instances pick a leader without a coordinator @@ -1125,7 +1300,7 @@ control plane could change our Kubernetes architecture in an undesirable way. coordinator picks the leader: - Components race to claim the lease - If a component claims the lease, the first thing it does is check the - identity leases to see if there is a better leader + lease candidates to see if there is a better leader - If it finds a better lease, it assigns the lease to that component instead of itself @@ -1133,12 +1308,10 @@ Pros: - No coordinated election controller Cons: - - - All component instances must watch the identity leases - - All components must have the code to decide which component is the best + - All leader elected components must have the code to decide which component is the best leader -### Component instances pick a leader without identity leases or a coordinator +### Component instances pick a leader without lease candidates or a coordinator - The candidates communicate through the lease to agree on the leader - Leases have "Election" and "Term" states @@ -1154,7 +1327,7 @@ Cons: Pros: - No coordinated election controller -- No identity leases +- No lease candidates Cons: @@ -1163,6 +1336,22 @@ Cons: in the ecosystem with the bug must upgrade client-go and release to be fixed. - More difficult to change/customize the criteria for which candidate is best. +### Algorithm configurability + +We've opted for a static fixed algorithm that looks at three things, continuing +down the list of comparisons if there is a tiebreaker. 
+ +- min(binary version) +- min(compatibility version) +- min(lease candidate name) + +The goal of the KEP is to make the leader predictable during a cluster upgrade where +leader elected components and apiservers may have mixed versions. This will make +all states of a Kubernetes control plane upgrade adhere to the version skew policy. + +An alternative is to make the leader election algorithm configurable either via flags +or a configuration file. + ## Future Work - Controller sharding could leverage coordinated leader election to load balance From 2984c452a0440e3402664228c3a618347adc2d9d Mon Sep 17 00:00:00 2001 From: Jefftree Date: Wed, 5 Jun 2024 16:42:22 +0900 Subject: [PATCH 10/13] Update to CLE KEP for additional details on Strategy --- .../4355-coordinated-leader-election/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index 929df238a0a..3dc65ea3e95 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -380,9 +380,11 @@ The `Strategy` field signals to the coordinated leader election controller the appropriate algorithm to use when selecting leaders. We will allow the Coordinated Leader Election controller to create a Lease -without a holder. The `Lease` may be updated by a third party to the desired -`spec.Strategy`. The strategy will always default to -`MinimumCompatibilityVersion`. +without a holder. If there are no candidate objects, the `Strategy` field will remain +empty to indicate that the `Lease` is not managed by the CLE controller. +Otherwise the strategy will always default to `MinimumCompatibilityVersion`. +The `Lease` may also be updated by a third party to the desired +`spec.Strategy` if an alternate strategy is preferred. 
This may be done either by the candidates, users, or additional controllers. #### Alternative for Strategy From 1732f4a6566ba889bdb03fe84318a3f3c2f1afa0 Mon Sep 17 00:00:00 2001 From: Jefftree Date: Wed, 5 Jun 2024 16:44:36 +0900 Subject: [PATCH 11/13] Fix namespace for canleadlease --- .../4355-coordinated-leader-election/README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index 3dc65ea3e95..73bc1c01146 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -269,7 +269,7 @@ metadata: name: some-custom-controller-0001A namespace: kube-system spec: - canLeadLease: kube-system/some-custom-controller + canLeadLease: some-custom-controller leaseDurationSeconds: 300 renewTime: "2023-12-05T02:33:08.685777Z" ``` @@ -380,11 +380,12 @@ The `Strategy` field signals to the coordinated leader election controller the appropriate algorithm to use when selecting leaders. We will allow the Coordinated Leader Election controller to create a Lease -without a holder. If there are no candidate objects, the `Strategy` field will remain -empty to indicate that the `Lease` is not managed by the CLE controller. -Otherwise the strategy will always default to `MinimumCompatibilityVersion`. -The `Lease` may also be updated by a third party to the desired -`spec.Strategy` if an alternate strategy is preferred. This may be done either by the candidates, users, or additional controllers. +without a holder. If there are no candidate objects, the `Strategy` field will +remain empty to indicate that the `Lease` is not managed by the CLE controller. +Otherwise the strategy will always default to `MinimumCompatibilityVersion`. 
The +`Lease` may also be updated by a third party to the desired `spec.Strategy` if +an alternate strategy is preferred. This may be done either by the candidates, +users, or additional controllers. #### Alternative for Strategy @@ -521,7 +522,7 @@ For the LeaseCandidate leases, a new lease will be created type LeaseCandidateSpec struct { // The fields BinaryVersion and CompatibilityVersion will be mandatory labels instead of fields in the spec - // CanLeadLease is in the format /, indicating the namespace and name of the lease that the candidate may lead + // CanLeadLease indicates the name of the lease that the candidate may lead CanLeadLease string // Strategy indicates the preferred strategy for the coordinated leader election controller to use. From 50641e837a74bf51624871798e82304916f2d0f8 Mon Sep 17 00:00:00 2001 From: Jefftree Date: Thu, 6 Jun 2024 15:49:27 +0900 Subject: [PATCH 12/13] Add PRR --- keps/prod-readiness/sig-api-machinery/4355.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 keps/prod-readiness/sig-api-machinery/4355.yaml diff --git a/keps/prod-readiness/sig-api-machinery/4355.yaml b/keps/prod-readiness/sig-api-machinery/4355.yaml new file mode 100644 index 00000000000..3ff5451a718 --- /dev/null +++ b/keps/prod-readiness/sig-api-machinery/4355.yaml @@ -0,0 +1,3 @@ +kep-number: 4355 +alpha: + approver: "@soltysh" From 5b1ebace0f79110dc7a3d06fa4328c2497880285 Mon Sep 17 00:00:00 2001 From: Jefftree Date: Thu, 6 Jun 2024 18:10:47 +0900 Subject: [PATCH 13/13] Update leaderless lease info --- .../README.md | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md index 73bc1c01146..31a2d9d5c61 100644 --- a/keps/sig-api-machinery/4355-coordinated-leader-election/README.md +++ b/keps/sig-api-machinery/4355-coordinated-leader-election/README.md @@ -379,13 +379,19 @@ and this 
can be done via the `spec.Strategy` field in a Lease. The `Strategy` field signals to the coordinated leader election controller the appropriate algorithm to use when selecting leaders. -We will allow the Coordinated Leader Election controller to create a Lease -without a holder. If there are no candidate objects, the `Strategy` field will -remain empty to indicate that the `Lease` is not managed by the CLE controller. -Otherwise the strategy will always default to `MinimumCompatibilityVersion`. The -`Lease` may also be updated by a third party to the desired `spec.Strategy` if -an alternate strategy is preferred. This may be done either by the candidates, -users, or additional controllers. +We will allow for the existence of a lease without a holder. This will allow +`Strategy` to be injected and preserved for leases that may not want to use the +default selected by CLE. If there are no candidate objects, the `Strategy` field +will remain empty to indicate that the `Lease` is not managed by the CLE +controller. Otherwise the strategy will always default to +`MinimumCompatibilityVersion`. The `Lease` may also be created or updated by a +third party to the desired `spec.Strategy` if an alternate strategy is +preferred. This may be done either by the candidates, users, or additional +controllers. + +Releasing a `Lease` will involve resetting the holderIdentity to `nil` instead +of deletion. This will preserve `Strategy` when a `Lease` object is released and +reacquired by another candidate. #### Alternative for Strategy @@ -525,9 +531,6 @@ type LeaseCandidateSpec struct { // CanLeadLease indicates the name of the lease that the candidate may lead CanLeadLease string - // Strategy indicates the preferred strategy for the coordinated leader election controller to use. 
- Strategy CoordinatedLeaseStrategy `json:"strategy,omitempty" protobuf:"string,6,opt,name=strategy"`
-
 // FIELDS DUPLICATED FROM LEASE

 // leaseDurationSeconds is a duration that candidates for a lease need
@@ -545,12 +548,6 @@ type LeaseCandidateSpec struct {
 Each LeaseCandidate lease may only lead one lock. If the same component wishes to lead many leases,
 a separate LeaseCandidate lease will be required for each lock.

-If the `LeaseCandidate` objects do not agree on a value for the Strategy, we will have an ordering priority.
-For instance, we define `NewestCompatibilityVersion` > `OldestCompatibilityVersion`. This means that if
-a subset of candidates have `OldestCompatibilityVersion` and a subset have `NewestCompatibilityVersion`,
-coordinated leader election will pick `NewestCompatibilityVersion`. In order for `OldestCompatibilityVersion` to be used,
-all `LeaseCandidate` objects must publish the same `Strategy`.
-
 ### Comparison of leader election

 | | Lease Based Leader Election | Coordinated Leader Election |