diff --git a/keps/prod-readiness/sig-node/3570.yaml b/keps/prod-readiness/sig-node/3570.yaml new file mode 100644 index 00000000000..9ae9af0b9b3 --- /dev/null +++ b/keps/prod-readiness/sig-node/3570.yaml @@ -0,0 +1,7 @@ +kep-number: 3570 +alpha: + approver: "@deads2k" +beta: + approver: "@deads2k" +stable: + approver: "@deads2k" diff --git a/keps/sig-node/375-cpu-manager/README.md b/keps/sig-node/3570-cpumanager/README.md similarity index 51% rename from keps/sig-node/375-cpu-manager/README.md rename to keps/sig-node/3570-cpumanager/README.md index e5c5d27d3e1..affe54c7a98 100644 --- a/keps/sig-node/375-cpu-manager/README.md +++ b/keps/sig-node/3570-cpumanager/README.md @@ -1,43 +1,113 @@ # CPU Manager -_Authors:_ - -* @ConnorDoyle - Connor Doyle <connor.p.doyle@intel.com> -* @flyingcougar - Szymon Scharmach <szymon.scharmach@intel.com> -* @sjenning - Seth Jennings <sjenning@redhat.com> - -## Table of Contents - -- [Overview](#overview) - - [Related issues](#related-issues) -- [Proposed changes](#proposed-changes) - - [CPU Manager component](#cpu-manager-component) - - [Discovering CPU topology](#discovering-cpu-topology) - - [CPU Manager interfaces (sketch)](#cpu-manager-interfaces-sketch) - - [Configuring the CPU Manager](#configuring-the-cpu-manager) +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1 : High-performance applications](#story-1--high-performance-applications) + - [Story 2 : KubeVirt](#story-2--kubevirt) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Discovering CPU topology](#discovering-cpu-topology) + - [CPU Manager interfaces (sketch)](#cpu-manager-interfaces-sketch) + - [Configuring the CPU Manager](#configuring-the-cpu-manager) - [Policy 1: "none" cpuset control [default]](#policy-1-none-cpuset-control-default) - [Policy 2: "static" cpuset control](#policy-2-static-cpuset-control) + - [CPU Manager options](#cpu-manager-options) - [Implementation sketch](#implementation-sketch) - [Example pod specs and interpretation](#example-pod-specs-and-interpretation) - [Example scenarios and interactions](#example-scenarios-and-interactions) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Deprecation](#deprecation) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Proposed and not implemented items](#proposed-and-not-implemented-items) - [Policy 3: "dynamic" cpuset control](#policy-3-dynamic-cpuset-control) - [Implementation 
sketch](#implementation-sketch-1) - - [Example pod specs and interpretation](#example-pod-specs-and-interpretation-1) -- [Operations and observability](#operations-and-observability) -- [Practical challenges](#practical-challenges) -- [Implementation roadmap](#implementation-roadmap) - - [Phase 1: None policy [TARGET: Kubernetes v1.8]](#phase-1-none-policy-target-kubernetes-v18) - - [Phase 2: Static policy [TARGET: Kubernetes v1.8]](#phase-2-static-policy-target-kubernetes-v18) - - [Phase 3: Beta support [TARGET: Kubernetes v1.9]](#phase-3-beta-support-target-kubernetes-v19) - - [Later phases [TARGET: After Kubernetes v1.9]](#later-phases-target-after-kubernetes-v19) -- [Appendix A: cpuset pitfalls](#appendix-a-cpuset-pitfalls) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) +- [Appendixes](#appendixes) + - [related issues](#related-issues) + - [Operations and observability](#operations-and-observability) + - [Practical challenges](#practical-challenges) + - [Original implementation roadmap](#original-implementation-roadmap) + - [Phase 1: None policy [TARGET: Kubernetes v1.8]](#phase-1-none-policy-target-kubernetes-v18) + - [Phase 2: Static policy [TARGET: Kubernetes v1.8]](#phase-2-static-policy-target-kubernetes-v18) + - [Phase 3: Beta support [TARGET: Kubernetes v1.9]](#phase-3-beta-support-target-kubernetes-v19) + - [Later phases [TARGET: After Kubernetes v1.9]](#later-phases-target-after-kubernetes-v19) + - [cpuset pitfalls](#cpuset-pitfalls) +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [X] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [X] (R) KEP approvers have approved the KEP status as `implementable` +- [X] (R) Design details are appropriately documented +- [X] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [X] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [X] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [X] "Implementation History" section is up-to-date for milestone +- [X] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + -## Overview +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website -_Problems to solve:_ +## Summary + +The *CPU Manager* is a new software component in Kubelet responsible for +assigning pod containers to sets of CPUs on the local node. In later +phases, the scope will expand to include caches, a critical shared +processor resource. 
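+
+A minimal kubelet configuration sketch that enables this behaviour is shown below;
+the field names follow the current `KubeletConfiguration` type, while the reserved
+CPUs and the policy options chosen here are purely illustrative:
+
+```yaml
+apiVersion: kubelet.config.k8s.io/v1beta1
+kind: KubeletConfiguration
+# "none" (the default) preserves the existing behavior; "static" enables exclusive CPU assignment.
+cpuManagerPolicy: static
+# Optional fine-tuning of the static policy (see "CPU Manager options" below).
+cpuManagerPolicyOptions:
+  full-pcpus-only: "true"
+# Explicit set of CPUs reserved for system daemons and the kubelet; these never
+# enter the pool of exclusively allocatable CPUs.
+reservedSystemCPUs: "0,1"
+```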
+
+The kuberuntime notifies the CPU manager when containers come and
+go. The first such notification occurs in between the container runtime
+interface calls to create and start the container. The second notification
+occurs after the container is stopped by the container runtime. The CPU
+Manager writes CPU settings for containers using a new CRI method named
+[`UpdateContainerResources`](https://github.com/kubernetes/kubernetes/pull/46105).
+This new method is invoked from two places in the CPU manager: during each
+call to `AddContainer` and also periodically from a separate
+reconciliation loop.
+
+This KEP supersedes and replaces `kubernetes/enhancements/keps/sig-node/375-cpumanager/README.md`.
+
+## Motivation

1. Poor or unpredictable performance observed compared to virtual machine based orchestration systems. Application latency and lower CPU
@@ -48,7 +118,7 @@ _Problems to solve:_
   for “fast” virtual network functions (want to approach line rate on modern server NICs.)

-_Solution requirements:_
+### Goals

1. Provide an API-driven contract from the system to a user: "if you are a Guaranteed pod with 1 or more cores of cpu, the system will try to make
@@ -61,31 +131,11 @@ _Solution requirements:_
   exclusive cores, since that would be antithetical to (1) above.
1. Take physical processor topology into account in the CPU affinity policy.

-### Related issues
-
-* Feature: [Further differentiate performance characteristics associated
-  with pod level QoS](https://github.com/kubernetes/features/issues/276)
-* Feature: [Add CPU Manager for pod cpuset
-  assignment](https://github.com/kubernetes/features/issues/375)
-
-## Proposed changes
+### Non-Goals
-### CPU Manager component
-
-The *CPU Manager* is a new software component in Kubelet responsible for
-assigning pod containers to sets of CPUs on the local node. In later
-phases, the scope will expand to include caches, a critical shared
-processor resource.
+N/A
-The kuberuntime notifies the CPU manager when containers come and
-go. The first such notification occurs in between the container runtime
-interface calls to create and start the container. The second notification
-occurs after the container is stopped by the container runtime. The CPU
-Manager writes CPU settings for containers using a new CRI method named
-[`UpdateContainerResources`](https://github.com/kubernetes/kubernetes/pull/46105).
-This new method is invoked from two places in the CPU manager: during each
-call to `AddContainer` and also periodically from a separate
-reconciliation loop.
+## Proposal

![cpu-manager-block-diagram](https://user-images.githubusercontent.com/379372/30137651-2352f4f0-9319-11e7-8be7-0aaeb6ce593a.png)

@@ -95,7 +145,40 @@ to build and test new policies. The shared state abstraction allows other
Kubelet components to be agnostic of the CPU manager policy for observability and checkpointing extensions._

-#### Discovering CPU topology
+### User Stories (Optional)
+
+#### Story 1 : High-performance applications
+
+Systems such as real-time trading systems or 5G CNFs (e.g. the User Plane Function, UPF) need to maximize CPU time; CPU pinning ensures exclusive CPU allocation and avoids performance issues caused by core switches and cold caches.
+NUMA-aware allocation of CPUs, provided by the CPU Manager in cooperation with the Topology Manager, is also a critical prerequisite for these applications to meet their performance requirements.
+The alignment of resources on the same NUMA node, CPUs first and foremost, prevents performance degradation due to inter-node (between NUMA nodes) communication overhead. + +#### Story 2 : KubeVirt + +KubeVirt leverages the CPU pinning provided by CPU manager to assign full CPU cores to vCPUs inside the VM to [enhance performance][kubevirt-cpus]. +[NUMA support for VMs][kubevirt-numa] is also built on top of the CPU pinning and NUMA-aware CPU allocation. + +### Notes/Constraints/Caveats (Optional) + +N/A + +### Risks and Mitigations + +Bugs in cpumanager can cause the kubelet to crash, or workloads to start with incorrect pinning. +This can be mitigated with comprehensive testing and improving the observability of the system +(see metrics). + +While the cpumanager core policy has seen no changes except for bugfixes since a while, +we introduced the [cpumanager options policy framework](https://github.com/fromanirh/enhancements/blob/master/keps/sig-node/2625-cpumanager-policies-thread-placement/README.md) +to enable the fine tuning of the static policy. +This area is more active, so bugs introduced with policy options can cause the kubelet to crash. +To mitigate this risk, we can make sure each policy option can be disabled independently, and +is not coupled with others, avoiding cascading failures or unnecessary coupling. +Graduation and testing criteria are deferred to the KEPs tracking the implementation of these features. + +## Design Details + +### Discovering CPU topology The CPU Manager must understand basic topology. First of all, it must determine the number of logical CPUs (hardware threads) available for @@ -125,7 +208,7 @@ Alternate options considered for discovering topology: 1. Execute a mature external topology program like [`mpi-hwloc`][hwloc] -- potentially adding support for the hwloc file format to the Kubelet. -#### CPU Manager interfaces (sketch) +### CPU Manager interfaces (sketch) ```go type State interface { @@ -156,11 +239,12 @@ type CPUSet map[int]struct{} // set operations and parsing/formatting helpers type CPUTopology // convenient type for querying and filtering CPUs ``` -#### Configuring the CPU Manager +### Configuring the CPU Manager -Kubernetes will ship with three CPU manager policies. Only one policy is +Kubernetes will ship with CPU manager policies. Only one policy is active at a time on a given node, chosen by the operator via Kubelet -configuration. The three policies are **none**, **static** and **dynamic**. +configuration. The policies are **none** and **static**. + The active CPU manager policy is set through a new Kubelet configuration value `--cpu-manager-policy`. The default value is `none`. @@ -216,6 +300,16 @@ application-level CPU affinity of their own, as those settings may be overwritten without notice (whenever exclusive cores are allocated or deallocated.) +#### CPU Manager options + +`CPUManagerPolicyOptions` allow to fine-tune the behavior of the `static` policy. +The details of each option are described in their own KEP. 
+As for kubernetes 1.26, the following options are available: + +- [full-pcpus-only](https://github.com/fromanirh/enhancements/blob/master/keps/sig-node/2625-cpumanager-policies-thread-placement/README.md) +- [distribute-cpus-across-numa](https://github.com/fromanirh/enhancements/blob/master/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md) +- [align-by-socket](https://github.com/fromanirh/enhancements/blob/master/keps/sig-node/3327-align-by-socket/README.md) + ##### Implementation sketch The static policy maintains the following sets of logical CPUs: @@ -323,6 +417,256 @@ func (p *staticPolicy) RemoveContainer(s State, containerID string) error { `floor(capacity.cpu - allocatable.cpu)` and the shared pool initially contains all CPUs in the system. + +### Test Plan + +[X] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + +##### Unit tests + + + + +- `k8s.io/kubernetes/pkg/kubelet/cm/cpumanager`: `20220929` - `86.2%` + +##### Integration tests + +- N/A + +##### e2e tests + +- `k8s.io/kubernetes/test/e2e_node/cpu_manager_test.go` + +### Graduation Criteria + +#### Alpha + +- Feature implemented behind a feature flag +- Initial e2e tests completed and enabled + +#### Beta + +- Gather feedback from developers and surveys +- Complete features A, B, C +- Additional tests are in Testgrid and linked in KEP + +#### GA + +- N examples of real-world usage +- N installs +- More rigorous forms of testing—e.g., downgrade tests and scalability tests +- Allowing time for feedback + +**Note:** Generally we also wait at least two releases between beta and +GA/stable, because there's no opportunity for user feedback, or even bug reports, +in back-to-back releases. + +**For non-optional features moving to GA, the graduation criteria must include +[conformance tests].** + +[conformance tests]: https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md + +#### Deprecation + +- Announce deprecation and support policy of the existing flag +- Two versions passed since introducing the functionality that deprecates the flag (to address version skew) +- Address feedback on usage/changed behavior, provided on GitHub issues +- Deprecate the flag + +### Upgrade / Downgrade Strategy + +No impact. It's always possible to trivially downgrade to the previous kubelet + +### Version Skew Strategy + +Not relevant + +## Production Readiness Review Questionnaire + +### Feature Enablement and Rollback + +###### How can this feature be enabled / disabled in a live cluster? + +- [X] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: `CPUManager` + - Components depending on the feature gate: kubelet + +NOTE: in order to enable the feature, the cluster admin needs also to enable +the `static` cpu manager policy. + +###### Does enabling the feature change any default behavior? + +No, unless the non-none policy (`static`) is explicitly configured. + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + +Yes, using the kubelet config. + +###### What happens if we reenable the feature if it was previously rolled back? + +The impact is node-local only. +If the state of a node is steady, no changes. +If a guaranteed pod is admitted, running non-guaranteed pods will have their CPU cgroup changed while running. 
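+
+For illustration, a pod of the kind referred to above (Guaranteed QoS with an
+integral CPU request) looks roughly like this; the name and image are placeholders:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: exclusive-cpus-demo  # placeholder name
+spec:
+  containers:
+  - name: app
+    image: registry.k8s.io/pause:3.9  # placeholder image
+    resources:
+      # requests == limits with an integral CPU count gives the pod Guaranteed QoS,
+      # so the static policy assigns this container exclusive CPUs.
+      requests:
+        cpu: "2"
+        memory: "1Gi"
+      limits:
+        cpu: "2"
+        memory: "1Gi"
+```
+
+Admitting such a pod shrinks the shared pool, which is why the CPU cgroups of the
+running non-guaranteed pods are updated as described above.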
+
+###### Are there any tests for feature enablement/disablement?
+
+Yes, covered by e2e tests.
+
+### Rollout, Upgrade and Rollback Planning
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+A rollout can fail if a bug in the cpumanager prevents _new_ pods from starting, or existing pods from being restarted.
+Already running workloads will not be affected if the node state is steady.
+
+###### What specific metrics should inform a rollback?
+
+"cpu_manager_pinning_errors_total". Note that even in a fully healthy system there are known benign conditions
+that can cause CPU allocation failures. A few selected examples are:
+
+- requesting an odd number of cores (not full physical cores) when the cpumanager is configured with the `full-pcpus-only` option
+- requesting NUMA-aligned cores, with Topology Manager enabled.
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+No to both.
+Changes in behavior only affect pods meeting the conditions (guaranteed QoS, integral CPU request) scheduled after the upgrade.
+Running pods will be unaffected by any change. This offers some degree of safety in both upgrade->rollback
+and upgrade->downgrade->upgrade scenarios.
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+No
+
+### Monitoring Requirements
+
+Monitor the metrics:
+- "cpu_manager_pinning_requests_total"
+- "cpu_manager_pinning_errors_total"
+
+###### How can an operator determine if the feature is in use by workloads?
+
+In order for pods to request exclusive CPU allocation and pinning, they need to match
+all the following criteria:
+- the pod QoS must be "guaranteed"
+- the CPU resource request/limit (`pod.spec.containers[].resources.limits.cpu`) must be integral.
+
+On top of that, at the kubelet level:
+- the cpumanager policy must be `static`.
+
+If all the criteria are met, then the feature is in use by workloads.
+
+###### How can someone using this feature know that it is working for their instance?
+
+- [X] Other (treat as last resort)
+  - Details: check the kubelet metric `cpu_manager_pinning_requests_total`
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+"cpu_manager_pinning_requests_total" and "cpu_manager_pinning_errors_total".
+We need to find a careful balance here because we don't want to leak hardware details or, in general, information
+dependent on the worker node hardware configuration (an example, even if arguably extreme, is the processor core layout).
+
+It is possible to infer which pods would trigger CPU pinning from the
+[pod resources request](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy),
+but adding these two metrics is both very cheap and helpful for the observability of the system.
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+- [X] Metrics
+  - Metric name:
+    - cpu_manager_pinning_requests_total
+    - cpu_manager_pinning_errors_total
+
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
+
+- "cpu_manager_pinning_requests_total"
+- "cpu_manager_pinning_errors_total"
+
+The addition of these metrics will be done before moving to GA
+([issue](https://github.com/kubernetes/kubernetes/issues/112854),
+ [PR](https://github.com/kubernetes/kubernetes/pull/112855)).
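+
+Once the metrics are available, an operator can alert on persistent pinning failures.
+A sketch using the prometheus-operator `PrometheusRule` CRD is shown below; it assumes
+the scraped metric carries the usual `kubelet_` prefix, and the threshold, duration and
+labels are illustrative only:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cpumanager-pinning-alerts  # placeholder name
+spec:
+  groups:
+  - name: cpumanager
+    rules:
+    - alert: CPUManagerPinningErrors
+      # Fires when a node keeps failing exclusive CPU assignments; remember that
+      # some allocation failures are benign (see the rollback section above).
+      expr: increase(kubelet_cpu_manager_pinning_errors_total[10m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "CPU Manager failed to pin containers on {{ $labels.instance }}"
+```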
+
+
+### Dependencies
+
+None
+
+###### Does this feature depend on any specific services running in the cluster?
+
+No
+
+### Scalability
+
+###### Will enabling / using this feature result in any new API calls?
+
+No, the feature is entirely node-local.
+
+###### Will enabling / using this feature result in introducing new API types?
+
+No, the feature is entirely node-local.
+
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+No, the feature is entirely node-local.
+
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+
+No, the feature is entirely node-local.
+
+###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
+
+No, the feature is entirely node-local.
+
+###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?
+
+No
+
+### Troubleshooting
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+No impact. The behavior of the feature does not change when the API server and/or etcd is unavailable, since the feature is node-local.
+
+###### What are other known failure modes?
+
+After changing the CPU manager policy from `none` to `static` or the other way around, you must remove
+the CPU manager state file (`/var/lib/kubelet/cpu_manager_state`) before starting the kubelet again; otherwise the kubelet will fail to start.
+Startup failures for this reason will be logged in the kubelet log.
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+
+Check the "cpu_manager_pinning_errors_total" metric and the kubelet logs on the affected node to understand why pinning requests fail.
+
+## Implementation History
+
+- **2022-09-29:** KEP translated to the most recent template available at the time; proposed for GA; added PRR info.
+
+## Drawbacks
+
+N/A
+
+## Alternatives
+
+### Proposed and not implemented items
+
#### Policy 3: "dynamic" cpuset control

_TODO: Describe the policy._
@@ -336,6 +680,7 @@ project (a subset of) the CPU manager state into a volume visible to
selected containers. User workloads could subscribe to update events in a
normal Linux manner (e.g. inotify.)
+
##### Implementation sketch

```go
@@ -352,14 +697,22 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error {
}
```

-##### Example pod specs and interpretation
+## Infrastructure Needed (Optional)
-| Pod | Interpretation |
-| ------------------------------------------ | ------------------------------ |
-| | |
-| | |
+N/A
+
+## Appendixes
+
+Record of information from the original KEP without a clear fit in the latest template.
+
+### Related issues
+
+* Feature: [Further differentiate performance characteristics associated
+  with pod level QoS](https://github.com/kubernetes/features/issues/276)
+* Feature: [Add CPU Manager for pod cpuset
+  assignment](https://github.com/kubernetes/features/issues/375)
-## Operations and observability
+### Operations and observability

* Checkpointing assignments
  * The CPU Manager must be able to pick up where it left off in case the
@@ -367,7 +720,7 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error {
* Read effective CPU assignments at runtime for alerting. This could be satisfied by the checkpointing requirement.
-## Practical challenges
+### Practical challenges

1. Synchronizing CPU Manager state with the container runtime via the CRI.
Runc/libcontainer allows container cgroup settings to be updated @@ -381,9 +734,9 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error { 1. Mitigation: defer supporting this until a new policy tailored for use with `isolcpus` can be added. -## Implementation roadmap +### Original implementation roadmap -### Phase 1: None policy [TARGET: Kubernetes v1.8] +#### Phase 1: None policy [TARGET: Kubernetes v1.8] * Internal API exists to allocate CPUs to containers ([PR 46105](https://github.com/kubernetes/kubernetes/pull/46105)) @@ -392,7 +745,7 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error { * All existing unit and e2e tests pass. * Initial unit tests pass. -### Phase 2: Static policy [TARGET: Kubernetes v1.8] +#### Phase 2: Static policy [TARGET: Kubernetes v1.8] * Kubelet can discover "basic" CPU topology (HT-to-physical-core map) * Static policy is implemented. @@ -401,12 +754,12 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error { * Performance metrics for one or more plausible synthetic workloads show benefit over none policy. -### Phase 3: Beta support [TARGET: Kubernetes v1.9] +#### Phase 3: Beta support [TARGET: Kubernetes v1.9] * Container CPU assignments are durable across Kubelet restarts. * Expanded user and operator docs and tutorials. -### Later phases [TARGET: After Kubernetes v1.9] +#### Later phases [TARGET: After Kubernetes v1.9] * Static policy also manages [cache allocation][cat] on supported platforms. * Dynamic policy is implemented. @@ -418,7 +771,7 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error { * Node-level coordination for NUMA-dependent resource allocations, for example devices, CPUs, memory-backed volumes including hugepages. -## Appendix A: cpuset pitfalls +### cpuset pitfalls 1. [`cpuset.sched_relax_domain_level`][cpuset-files]. 
"controls the width of the range of CPUs over which the kernel scheduler performs immediate @@ -436,6 +789,8 @@ func (p *dynamicPolicy) RemoveContainer(s State, containerID string) error { [cat]: http://www.intel.com/content/www/us/en/communications/cache-monitoring-cache-allocation-technologies.html [cpuset-files]: http://man7.org/linux/man-pages/man7/cpuset.7.html#FILES +[kubevirt-cpus]: https://kubevirt.io/user-guide/virtual_machines/dedicated_cpu_resources/ +[kubevirt-numa]: https://kubevirt.io/user-guide/virtual_machines/numa/#preconditions [ht]: http://www.intel.com/content/www/us/en/architecture-and-technology/hyper-threading/hyper-threading-technology.html [hwloc]: https://www.open-mpi.org/projects/hwloc [node-allocatable]: /contributors/design-proposals/node/node-allocatable.md#phase-2---enforce-allocatable-on-pods diff --git a/keps/sig-node/3570-cpumanager/kep.yaml b/keps/sig-node/3570-cpumanager/kep.yaml new file mode 100644 index 00000000000..53a53bf4b3a --- /dev/null +++ b/keps/sig-node/3570-cpumanager/kep.yaml @@ -0,0 +1,51 @@ +title: CPU Manager +kep-number: 3570 +authors: + - "@ConnorDoyle" + - "@flyingcougar" + - "@sjenning" + - "@fromanirh" # ONLY for GA graduation and PRR review +owning-sig: sig-node +participating-sigs: + - sig-node +reviewers: + - "@derekwaynecarr" +approvers: + - "@dawnchen" + - "@derekwaynecarr" +editor: Connor Doyle +creation-date: 2017-05-23 +last-updated: 2022-10-03 +status: implementable +see-also: +replaces: + - "kubernetes/community/contributors/design-proposals/node/cpu-manager.md" + - "kubernetes/enhancements/keps/sig-node/375-cpumanager/README.md" +superseded-by: + +# The target maturity stage in the current dev cycle for this KEP. +stage: stable + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.26" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.8" + beta: "v1.10" + stable: "v1.26" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: CPUManager + components: + - kubelet +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - cpu_manager_pinning_requests_total + - cpu_manager_pinning_errors_total diff --git a/keps/sig-node/375-cpu-manager/kep.yaml b/keps/sig-node/375-cpu-manager/kep.yaml deleted file mode 100644 index dc9e3720a10..00000000000 --- a/keps/sig-node/375-cpu-manager/kep.yaml +++ /dev/null @@ -1,25 +0,0 @@ -title: CPU Manager -kep-number: 375 -authors: - - "@ConnorDoyle" - - "@flyingcougar" - - "@sjenning" -owning-sig: sig-node -participating-sigs: - - sig-node -reviewers: - - "@derekwaynecarr" -approvers: - - "@dawnchen" - - "@derekwaynecarr" -editor: Connor Doyle -creation-date: 2017-05-23 -last-updated: 2017-05-23 -status: implementable -see-also: -replaces: - - " kubernetes/community/contributors/design-proposals/node/cpu-manager.md" -superseded-by: - -latest-milestone: "0.0" -stage: "alpha"