From d75683f5078c6b14d74b4219204436cb2b78ec3a Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Wed, 19 May 2021 11:59:06 -0700 Subject: [PATCH 1/7] KEP 2763: Ambient capabilities in Kubernetes --- keps/prod-readiness/sig-security/2763.yaml | 3 + .../2763-ambient-capabilities/README.md | 928 ++++++++++++++++++ .../2763-ambient-capabilities/kep.yaml | 50 + 3 files changed, 981 insertions(+) create mode 100644 keps/prod-readiness/sig-security/2763.yaml create mode 100644 keps/sig-security/2763-ambient-capabilities/README.md create mode 100644 keps/sig-security/2763-ambient-capabilities/kep.yaml diff --git a/keps/prod-readiness/sig-security/2763.yaml b/keps/prod-readiness/sig-security/2763.yaml new file mode 100644 index 00000000000..98538cdf54f --- /dev/null +++ b/keps/prod-readiness/sig-security/2763.yaml @@ -0,0 +1,3 @@ +kep-number: 2763 +beta: + approver: "@ehashman" diff --git a/keps/sig-security/2763-ambient-capabilities/README.md b/keps/sig-security/2763-ambient-capabilities/README.md new file mode 100644 index 00000000000..d7beb1e7aa6 --- /dev/null +++ b/keps/sig-security/2763-ambient-capabilities/README.md @@ -0,0 +1,928 @@ +# KEP-2763: Support Ambient Capabilities in Kubernetes. 
+ + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Story 3](#story-3) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) + - [Restricted ambient capabilities.](#restricted-ambient-capabilities) + - [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) + - [Changes to containerd/containerd (https://github.com/containerd/containerd)](#changes-to-containerdcontainerd-httpsgithubcomcontainerdcontainerd) + - [Order of changes](#order-of-changes) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +[Ambient capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html) is a set of capabilities that are preserved across an execve(2) of a program that is not privileged. This KEP proposes that kubernetes provide a way to set ambient capabilities for containers through the Pod manifest. 
It also proposes changes that must be made to `containerd` and `CRI-O` to enable ambient capabilities end-to-end. + +## Motivation + +Running containers as non-root has been a long recommended best-practice in kubernetes and we have published [blogs](https://kubernetes.io/blog/2018/07/18/11-ways-not-to-get-hacked/#8-run-containers-as-a-non-root-user) recommending this best practice. In addition to running as non-root it is also recommended that all capabilities other than the ones required are dropped from the container. Since most containers don’t require any capabilities this guidance becomes easy to follow. + +NOTE: In the example below net_bind_service is only userd for demonstration purposes, in no way are we scoping this KEP just to net_bind_service. See the [User Stories (Optional)](#user-stories-optional) section below for other cases where ambient capabilities can help. + +**The following works:-** +```Dockerfile +FROM ubuntu + +COPY main /bin/simpleserver +``` + +``` +docker build -t simpleserver:nofilecaps . +``` + +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["All"] + image: simpleserver:nofilecaps + command: ["simpleserver", "--port", "8080"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 1/1 Running 0 10s +``` + +**The following does not work:** because the container is running as non-root user `1000` and since it is mounting a privileged port it will require `CAP_NET_BIND_SERVICE` linux capability. 
+ +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["All"] + add: ["NET_BIND_SERVICE"] + image: simpleserver:nofilecaps + command: ["simpleserver", "--port", "80"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 0/1 CrashLoopBackOff 1 6s + +kubectl logs static-web +2021/06/05 18:21:31 About to listen on port: 80 +2021/06/05 18:21:31 http.ListenAndServe(:80, nil) failed with err: listen tcp :80: bind: permission denied +``` + +Capabilities that are either added explicitly in the manifest or by default to a non-root container do not get added to its effective and permitted set because effective and permitted sets get cleared when you transition from UID 0 to UID !0. To get around this today users have to apply the capabilities to the binary in their image build phase. + +**A user could apply file capabilities as shown below :-** + +```Dockerfile +FROM ubuntu + +COPY simpleserver /bin/simpleserver + +RUN apt-get update && apt-get -y --no-install-recommends install libcap2-bin + +RUN setcap cap_net_bind_service=+ep /bin/simpleserver +``` + +``` +docker build -t simpleserver:filecaps . 
+``` + +**Now the if we update the image and create the pod:-** + +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["All"] + add: ["NET_BIND_SERVICE"] + image: simpleserver:filecaps + command: ["simpleserver", "--port", "80"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 0/1 CrashLoopBackOff 1 17s + +kubectl logs static-web +2021/06/05 18:25:09 About to listen on port: 80 +2021/06/05 18:25:09 http.ListenAndServe(:80, nil) failed with err: listen tcp :80: bind: permission denied +``` + +**Note** The above does not work because we cannot set `allowPrivilegeEscalation: false` anymore because `allowPrivilegeEscalation` directly controls the `no_new_privs` flag. With `no_new_privs` set, file capabilities are not added to the permitted set. + +**Now updating the Pod by removing `allowPrivilegeEscalation: false`:-** + +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + capabilities: + drop: ["All"] + add: ["NET_BIND_SERVICE"] + image: simpleserver:filecaps + command: ["simpleserver", "--port", "80"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 1/1 Running 0 9s +``` + +**Note:** While applying the capabilities to the binary during the image build is a work-around, it should be noted that now even if we switched back to port 8080 in the above example we would have to add the capability in the container's `SecurityContext`. 
+ +**If we update the port to non-privileged but use the setcap image:-** + +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["All"] + image: simpleserver:filecaps + command: ["simpleserver", "--port", "8080"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 0/1 CrashLoopBackOff 1 11s + +kubectl logs static-web +standard_init_linux.go:211: exec user process caused "operation not permitted" +``` + +**Now if we add cap_net_bind_service to the container even though it is not needed, it works** + +```yaml +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["All"] + add: ["NET_BIND_SERVICE"] + image: simpleserver:filecaps + command: ["simpleserver", "--port", "8080"] +``` + +``` +kubectl apply -f pod.yaml +pod/static-web created + +kubectl get pods +NAME READY STATUS RESTARTS AGE +static-web 1/1 Running 0 10s +``` + +While applying capabilities to the binary during image build is a work-around it is not always feasible to do so as a lot of users use 3rd party images, so in order to run containers as non-root in an image whose build you do not control you would essentially have to do something like: + +```Dockerfile +FROM debian-base:buster-v1.4.0 +COPY --from=imageThatIWantToRunAsNonRoot /binaryThatIWantToRunAsNonRoot /binaryThatIWantToRunAsNonRoot +RUN apt-get update \ + && apt-get -y --no-install-recommends install libcap2-bin +RUN setcap cap_my_binary_needs=+ep /binaryThatIWantToRunAsNonRoot + +FROM imageThatIWantToRunAsNonRoot +# override the binary in the image +COPY --from=0
/binaryThatIWantToRunAsNonRoot /binaryThatIWantToRunAsNonRoot +``` + +Here we override the binary in the original image with a setcaped binary and push the image to our private repository. While this is a workaround to the problem if you don’t control the build, you are now forced to effectively maintain another copy of the image. + +The way capabilities are today designed is that they are effective in limiting the capabilities of a user but we lack the ability to grant capabilities to a user unless we also control the build of an image. + +### Goals + +- Enable support for ambient capabilities for Kubernetes and the `containerd` and `CRI-O` runtime. +- Define a safe set of capabilities that should be allowed to be added to the ambient set. + +### Non-Goals + + +- Runtimes other than `containerd` and `CRI-O` are not in scope at the moment. +- Windows is not covered in this design. + +## Proposal + +- We propose updating the kubernetes core and CRI API's to allow users to configure adding capabilities to the ambient set. This would allow users to run non-root containers without having to apply file capabilities to the binary during image build time. Details on how and where these changes will be made are explained in the [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) and [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) sections below. + +- We also propose that container runtimes like `containerd` and `CRI-O` be updated to account for these changes to the CRI apis and add the requested capabilities to the ambient set when they create the containers. Details on how and where these changes will be made are explained in the [Changes to containerd/containerd (https://github.com/containerd/containerd)](#changes-to-containerdcontainerd-httpsgithubcomcontainerdcontainerd) section below. 
+ +### User Stories (Optional) + +<<[UNRESOLVED] How much demand is there for this feature outside of NET_BIND_SERVICE>> + +_Blocking for Alpha._ + +Is there enough demand for this feature today? + +<<[/UNRESOLVED]>> + +#### Story 1 +As a security conscious user I should be able to run my container as non-root and add the minimum set of capabilities required for it to function even when I do not control how the image is built. + +#### Story 2 +@Jc2K [commented](https://github.com/kubernetes/enhancements/pull/2757#issuecomment-906729690) +If you were running one of the popular IDS's or some other network monitoring tool like moloch/arike on your cluster as a `DaemonSet` then i'd expect you could run it as a non root user with CAP_NET_RAW (and possibly CAP_NET_ADMIN). + +I personally have run `DaemonSet`'s that would use `CAP_NET_ADMIN` to configure some environment specific networking. I would have run this as non-root if I had this KEP. + +Yes, `CAP_NET_ADMIN` is a pretty elevated permission. Most of the use cases I can think are things i'd expect the team deploying the CNI to be interested in, rather than the person deploying something on port 443 (where they are different people). I'd still prefer to deploy them not as root, where that actually works. + +#### Story 3 +@Jc2K [commented](https://github.com/kubernetes/enhancements/pull/2757#discussion_r697016426) +The most useful use of this is where you have to run something that does have to have pretty elevated permissions. Like an IDS, or a network visibility tool, or some daemon that configures some weird aspect of your network stack. You can't get away from the fact that these things are running and they are running with NET_ADMIN or that they are in the host network namespace. Let's set as many restrictions on them as we can and run them as non-root. + +### Risks and Mitigations +One risk of adding this feature is that it could be used to give non-root users elevated permissions on the host. 
For example one could add `cap_sys_admin` to the ambient capabilities set of a container making it essentially root on host. To mitigate this we propose that for alpha we only support a carefully curated set of "safe" capabilities that are most commonly required by workloads. + +## Design Details + +### Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1) + +<<[UNRESOLVED pick how we want to update the K8S APIs]>> + +_Blocking for Alpha._ + +There are 2 options here:- +- Option 1: Reuse Add field in [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) + + When a capability gets added explicitly to a non-root container it also gets added to the ambient set in addition to getting added to inheritable, permitted, bounding and effective sets. The default capabilities are not added to the ambient set. + + Pros and Cons + + :+1: Simple add and drop API is easy to use from a user perspective. + + :+1: No new field needs to be added to the K8S API Capabilities object. + + :-1: How would we tell the difference at API validation time? I assume we'd have to look at mustRunAsNonRoot, because the apiserver has no idea whether a textual runAsUser is root or not. mustRunAsNonRoot is designed to fail at the last minute, in the Kubelet, which is bad UX. This makes this approach very awkward. Different behavior for root vs non-root might be confusing for users. + + :-1: Since we are changing the default behavior for non-root containers we might break existing user containers. Issues with linux capabilities have proven themselves to be very hard to debug in the past. + +- Option 2: Add new field to [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) + + When a capability is added using this field only then does it get added to the ambient set in addition to inheritable, permitted, bounding and effective sets. The behavior is consistent for root and non-root containers.
+ + ```diff + type Capabilities struct { + // Added capabilities + // +optional + Add []Capability `json:"add,omitempty" protobuf:"bytes,1,rep,name=add,casttype=Capability"` + // Removed capabilities + // +optional + Drop []Capability `json:"drop,omitempty" protobuf:"bytes,2,rep,name=drop,casttype=Capability"` + + // Ambient capabilities to add + + // +optional + + Ambient []Capability `json:"ambient,omitempty" protobuf:"bytes,3,rep,name=ambient,casttype=Capability"` + } + ``` + + Pros and Cons + + :+1: Since this is a new field existing user workloads are unaffected by this change. + + :+1: The capabilities struct only really translates to Linux Capabilities, adding a new field does make it easier for users who understand Linux capabilities to configure them. + + :+1: Ambient caps are useful for root containers as well. A root process can control what caps it gives to child processes that are running as non-root. The capabilities list for these non-root processes may not be the same as the root caps in Add. + + :-1: Requires changes to the K8S API Capabilities object. + +<<[/UNRESOLVED]>> + +`Ambient` capabilities will adhere to the following rules: + +1. Ambient capabilities can only be added. +2. Default capabilities are not added to the ambient capabilities set. By default the container has an empty ambient capability set. +3. Only capabilities that are explicitly added in the manifest will be added to the ambient set. +4. Since the ambient capability set obeys the invariant that no capability can ever be ambient if it is not both permitted and inheritable, adding a capability to the ambient set will also add it to the permitted and inheritable sets. + +#### Restricted ambient capabilities.
+ +<<[UNRESOLVED what is the set of capabilities that we should allow to be made ambient]>> + +_Target Beta, non-blocking for Alpha._ + +Some capabilities like `CAP_SYS_ADMIN` and `CAP_DAC_OVERRIDE` make a non-root user very powerful and we should restrict them from being added to the ambient capabilities set. + +<<[/UNRESOLVED]>> + +### Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2) + +We propose adding a new field to the [Capability](https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2#Capability) struct: + +```diff +type Capability struct { + // List of capabilities to add. + AddCapabilities []string `protobuf:"bytes,1,rep,name=add_capabilities,json=addCapabilities,proto3" json:"add_capabilities,omitempty"` + // List of capabilities to drop. + DropCapabilities []string `protobuf:"bytes,2,rep,name=drop_capabilities,json=dropCapabilities,proto3" json:"drop_capabilities,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_sizecache int32 `json:"-"` ++ // List of ambient capabilities to add. ++ AddAmbientCapabilities []string `protobuf:"bytes,3,rep,name=add_ambient_capabilities,json=addAmbientCapabilities,proto3" json:"add_ambient_capabilities,omitempty"` +} +``` + +Adding capabilities to the `ambient` field in the `capabilities` field of the container's `securityContext` will directly update this field, and these changes would live in the [convertToRuntimeCapabilities](https://github.com/kubernetes/kubernetes/blob/ea0764452222146c47ec826977f49d7001b0ea8c/pkg/kubelet/kuberuntime/security_context.go#L121:6) function. + + +### Changes to containerd/containerd (https://github.com/containerd/containerd) + +`containerd` clears all ambient capabilities on container creation. See [this](https://github.com/containerd/containerd/blob/055c801ededcb7a5e82f47bdeed555cdf6c64bd8/pkg/cri/server/container_create_linux.go#L233) link for details. + +```go +// Clear all ambient capabilities.
The implication of non-root + caps +// is not clearly defined in Kubernetes. +// See https://github.com/kubernetes/kubernetes/issues/56374 +// Keep docker's behavior for now. +specOpts = append(specOpts, + customopts.WithoutAmbientCaps, + customopts.WithSelinuxLabels(processLabel, mountLabel), +) +``` + +We would need to update code at that link to add the ambient capabilities by calling the [WithAmbientCapabilities](https://github.com/containerd/containerd/blob/ab963e1cc16a845567a0e3e971775c29c701fcf8/oci/spec_opts.go#L858) function instead of the [WithoutAmbientCapabilities](https://github.com/containerd/containerd/blob/04f73e3f8a097d95111f8419fa136d196b3a8725/pkg/cri/opts/spec_linux.go#L354) function. + +### Order of changes +We propose that the changes be made in the following order: + +1. Changes described in the [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) section are made first as both containerd and kubernetes rely on these. +2. Changes described in the [Changes to containerd/containerd (https://github.com/containerd/containerd)](#changes-to-containerdcontainerd-httpsgithubcomcontainerdcontainerd) are made next. +3. Once a version of `containerd` that supports ambient capabilities is available we make the changes described in the [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) section. + + +### Test Plan + + + + +### Graduation Criteria +Maturity level of this feature is defined by the AmbientCapabilities feature-gate. + +#### Alpha + +The alpha of this feature is about making sure that all the infrastructure +required to enable ambient capabilities is in place. It includes the following work:- + +1. 
The following sections need to be resolved before Alpha:- + - [ ] [User Stories (Optional)](#user-stories-optional) + - [ ] [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) + +2. AmbientCapabilities feature-gate added in default off configuration (feature is opt-in by default) +3. Changes to CRI API as described in section [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) implemented. +4. A version of containerd and CRI-O that properly sets the ambient capabilities as described in section [Changes to containerd/containerd (https://github.com/containerd/containerd)](#changes-to-containerdcontainerd-httpsgithubcomcontainerdcontainerd) available. +5. Changes to k8s API as described in [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) implemented. +6. Kubelet code updated to set ambient capabilities, with these changes hidden behind the feature-gate. +7. e2e tests with the version of containerd that supports ambient capabilities with the feature gate enabled in kubelet passing in TestGrid. + +#### Beta + +While alpha was focused on setting up the infra for this feature, beta will focus on the safe enablement. + +1. Collect feedback from Alpha and use it to resolve the following sections before Beta: + - [ ] [Restricted ambient capabilities.](#restricted-ambient-capabilities) +2. Update Pod validation to prevent restricted capabilities from being added based on how the section above is resolved. +3. Enable `AmbientCapabilities` feature-gate by default. (feature is opt-out by default) +4. Thorough testing is already expected for alpha, but we will review our test coverage and fill any gaps prior to beta.
+ +#### GA + +<<[UNRESOLVED]>> +_Blocking for GA, non-blocking for Alpha and Beta._ + +- Examples of real world usage and positive user feedback. + +<<[/UNRESOLVED]>> + + +### Upgrade / Downgrade Strategy + + +There is no toil expected on upgrade of existing workloads. But, if a `container` has capabilities in the `ambient` field in the `capabilities` field of the `securityContext` and the cluster is downgraded to n-1 then the workload with the new field wouldn't get the ambient capabilities and may fail. + +### Version Skew Strategy + + +If a kubelet that is not aware of the `ambient` field in `capabilities` field of the `securityContext` then it will simply ignore the field. We propose adding a taint/toleration/well known label to mark nodes that can support ambient capability requests. + +If the node is running a version of `containerd`/`CRI-O` that does not support ambient capabilities then the workload may fail even if it is supported by kubelet and the `AmbientCapabilities` feature-gate is enabled. To prevent this from happening we plan to add documentation to specify the version of containerd and CRI-O that support ambient capabilities. + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: AmbientCapabilities + - Components depending on the feature gate: `kubelet`, `kube-apiserver` + +###### Does enabling the feature change any default behavior? + + +No + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + +Yes + +###### What happens if we reenable the feature if it was previously rolled back? + +If there are no containers which specify the `ambient` capabilities to add in their `securityContext` then nothing will change. 
+If there are containers which specify `ambient` capabilities in their `securityContext` then these containers will have the capability or capabilities in their ambient capability set. + +###### Are there any tests for feature enablement/disablement? + + +The necessary unit tests in kubelet dealing with creation of Pods with ambient capabilities with the feature gate on and off will be added. + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + +No. + +###### Will enabling / using this feature result in introducing new API types? + + +No. 
+ +###### Will enabling / using this feature result in any new calls to the cloud provider? + + +No. + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + +No. + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + +No. + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + +No. + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + +N/A. + +## Alternatives +- Including `NET_BIND_SERVICE` in the ambient set by default. + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-security/2763-ambient-capabilities/kep.yaml b/keps/sig-security/2763-ambient-capabilities/kep.yaml new file mode 100644 index 00000000000..9420b094e1d --- /dev/null +++ b/keps/sig-security/2763-ambient-capabilities/kep.yaml @@ -0,0 +1,50 @@ +title: Support Ambient Capabilities in Kubernetes +kep-number: 2763 +authors: + - "@vinayakankugoyal" +owning-sig: sig-security +participating-sigs: + - sig-node +status: provisional +creation-date: 2021-05-20 +reviewers: + - "@PushkarJ" # sig-security + - "@tallclair" # sig-auth + - "@mrunalp" # sig-node and CRI-O + - "@mikebrow" # sig-node and containerd + - "@SergeyKanzhelev" # sig-node and CRI API +approvers: + - TBD +prr-approvers: + - "@ehashman" +see-also: + - TBD +replaces: + - TBD + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on.
+latest-milestone: "v1.23" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.23" + beta: "TBD" + stable: "TBD" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: AmbientCapabilities + components: + - kubelet + - kube-apiserver +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - TBD From 733ff2eb386ec7beddada435ce8d5c883efe93ee Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Tue, 14 Sep 2021 12:29:40 +0300 Subject: [PATCH 2/7] KEP 2763: Add use cases and polish Signed-off-by: Alexey Perevalov --- .../2763-ambient-capabilities/README.md | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/keps/sig-security/2763-ambient-capabilities/README.md b/keps/sig-security/2763-ambient-capabilities/README.md index d7beb1e7aa6..c87fa78ebfc 100644 --- a/keps/sig-security/2763-ambient-capabilities/README.md +++ b/keps/sig-security/2763-ambient-capabilities/README.md @@ -47,7 +47,7 @@ Items marked with (R) are required *prior to targeting to a milestone / release* - [ ] (R) Design details are appropriately documented - [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) - [ ] e2e Tests for all Beta API Operations (endpoints) - - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free - [ ] (R) Graduation criteria is in place - [ ] (R) [all GA
Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) @@ -70,7 +70,7 @@ Items marked with (R) are required *prior to targeting to a milestone / release* Running containers as non-root has been a long recommended best-practice in kubernetes and we have published [blogs](https://kubernetes.io/blog/2018/07/18/11-ways-not-to-get-hacked/#8-run-containers-as-a-non-root-user) recommending this best practice. In addition to running as non-root it is also recommended that all capabilities other than the ones required are dropped from the container. Since most containers don’t require any capabilities this guidance becomes easy to follow. -NOTE: In the example below net_bind_service is only userd for demonstration purposes, in no way are we scoping this KEP just to net_bind_service. See the [User Stories (Optional)](#user-stories-optional) section below for other cases where ambient capabilities can help. +NOTE: In the example below net_bind_service is only used for demonstration purposes, in no way are we scoping this KEP just to net_bind_service. See the [User Stories (Optional)](#user-stories-optional) section below for other cases where ambient capabilities can help. **The following works:-** ```Dockerfile @@ -108,7 +108,7 @@ spec: kubectl apply -f pod.yaml pod/static-web created -kubectl get pods +kubectl get pods NAME READY STATUS RESTARTS AGE static-web 1/1 Running 0 10s ``` @@ -239,7 +239,7 @@ NAME READY STATUS RESTARTS AGE static-web 1/1 Running 0 9s ``` -**Note:** While applying the capabilities to the binary during the image build is a work-around, it should be noted that now even if we switched back to port 8080 in the above example we would have to add the capability in the container's `SecurityContext`. 
+**Note:** While applying the capabilities to the binary during the image build is a work-around, it should be noted that now even if we switched back to port 8080 in the above example we would have to add the capability in the container's `SecurityContext`. **If we update the port to non-privileged but use the setcap image:-** @@ -325,7 +325,7 @@ COPY --from=0 /binaryThatIWantToRunAsNonRoot /binaryThatIWantToRunAsNonRoot Here we override the binary in the original image with a setcaped binary and push the image to our private repository. While this is a workaround to the problem if you don’t control the build, you are now forced to effectively maintain another copy of the image. -The way capabilities are today designed is that they are effective in limiting the capabilities of a user but we lack the ability to grant capabilities to a user unless we also control the build of an image. +As capabilities are designed today, they are effective at limiting the capabilities of a user, but we lack the ability to grant capabilities to a user unless we also control the build of an image. ### Goals @@ -356,6 +356,7 @@ _Blocking for Alpha._ Is there enough demand for this feature today? <<[/UNRESOLVED]>> +This might be useful for those who run their own Kubernetes installation and deploy CNI plugins in pods, which is a very common practice nowadays. A rarer practice is to load BPF programs for networking or security tracking (`CAP_BPF` is available since Linux kernel version 5.8); for example, [KubeArmor](https://github.com/kubearmor/KubeArmor) loads BPF programs for security purposes. There are also cases where system components deployed in a Pod load kernel modules. `CAP_SYS_PTRACE` could be used for debugging purposes, and the `CAP_AUDIT_*` capabilities could be used by intrusion detection systems. 
#### Story 1 As a security conscious user I should be able to run my container as non-root and add the minimum set of capabilities required for it to function even when I do not control how the image is built. @@ -394,7 +395,7 @@ There are 2 options here:- :+1: No new field needs to added to the K8S API Capabilities object. - :-1: How would we tell the difference at API validation time? I assume we'd have to look at mustRunAsNonRoot, because the apiserver has no idea whether a textual runAsUser is root or not. mustRunAsNonRoot it designed to fail at the last minute, in the Kubelet, which is bad UX. This makes this approach very awkward. Difrrent behavior for root vs non-root might be confusing for users. + :-1: How would we tell the difference at API validation time? I assume we'd have to look at mustRunAsNonRoot, because the apiserver has no idea whether a textual runAsUser is root or not. mustRunAsNonRoot is designed to fail at the last minute, in the Kubelet, which is bad UX. This makes this approach very awkward. Different behavior for root vs non-root might be confusing for users. :-1: Since we are changing the default behavior for non-root containers we might break existing user containers. Issues with linux capabilities have proven themselves to be very hard to debug in the past. @@ -420,7 +421,7 @@ There are 2 options here:- :+1: Since this is a new field existing user workloads are unaffected by this change. - :+1: The capabilities struct only really translates to Linux Capabilities, adding a new field does make it easier for users who understand Linux capabilities to confiure them. + :+1: The capabilities struct only really translates to Linux Capabilities, adding a new field does make it easier for users who understand Linux capabilities to configure them. :+1: Ambient caps are useful for root containers as well.A root process can control what caps it gives to child processes that are running as non-root. 
The capabilities list list for these non-root processes may not be the same as the root caps in Add. @@ -462,12 +463,12 @@ type Capability struct { } ``` -Adding apabilities to the `ambient` field in the `capabilities` field of the containers `securityContext` will directly update this field, and these changes would live in the [convertToRuntimeCapabilities](https://github.com/kubernetes/kubernetes/blob/ea0764452222146c47ec826977f49d7001b0ea8c/pkg/kubelet/kuberuntime/security_context.go#L121:6) function. +Adding capabilities to the `ambient` field in the `capabilities` field of the containers `securityContext` will directly update this field, and these changes would live in the [convertToRuntimeCapabilities](https://github.com/kubernetes/kubernetes/blob/ea0764452222146c47ec826977f49d7001b0ea8c/pkg/kubelet/kuberuntime/security_context.go#L121:6) function. ### Changes to containerd/containerd (https://github.com/containerd/containerd) -`containerd` clears all ambient capabilities on container creation. See [this](https://github.com/containerd/containerd/blob/055c801ededcb7a5e82f47bdeed555cdf6c64bd8/pkg/cri/server/container_create_linux.go#L233) link for details. +`containerd` clears all ambient capabilities on container creation. See [this](https://github.com/containerd/containerd/blob/055c801ededcb7a5e82f47bdeed555cdf6c64bd8/pkg/cri/server/container_create_linux.go#L233) link for details. ```go // Clear all ambient capabilities. The implication of non-root + caps @@ -480,7 +481,7 @@ specOpts = append(specOpts, ) ``` -We would need to update code at that link to add the ambient capabilities by calling the [WithAmbientCapabilities](https://github.com/containerd/containerd/blob/ab963e1cc16a845567a0e3e971775c29c701fcf8/oci/spec_opts.go#L858) function instead of the [WithoutAmbientCapabilities](https://github.com/containerd/containerd/blob/04f73e3f8a097d95111f8419fa136d196b3a8725/pkg/cri/opts/spec_linux.go#L354) function. 
+We would need to update the code at that link to add the ambient capabilities by calling the [WithAmbientCapabilities](https://github.com/containerd/containerd/blob/ab963e1cc16a845567a0e3e971775c29c701fcf8/oci/spec_opts.go#L858) function instead of the [WithoutAmbientCapabilities](https://github.com/containerd/containerd/blob/04f73e3f8a097d95111f8419fa136d196b3a8725/pkg/cri/opts/spec_linux.go#L354) function. ### Order of changes We propose that the changes be made in the following order: @@ -523,8 +524,8 @@ required to enable ambient capabilities is in place. It includes the following w - [ ] [User Stories (Optional)](#user-stories-optional) - [ ] [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) -2. AmbientCapabilities feature-gate added in default off configuration(feature is opt-in by default) -3. Changes to CRI API as described in section [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) implemented. +2. AmbientCapabilities feature-gate is added to the default configuration as disabled (feature is opt-in by default) +3. Changes to CRI API as described in section [Changes to runtime API (https://pkg.go.dev/k8s.io/cri-api/pkg/apis/runtime/v1alpha2)](#changes-to-runtime-api-httpspkggodevk8siocri-apipkgapisruntimev1alpha2) are implemented. 4. A version of containerd and CRI-O that properly sets the ambient capabilities as described in section [Changes to containerd/containerd (https://github.com/containerd/containerd)](#changes-to-containerdcontainerd-httpsgithubcomcontainerdcontainerd) available. 5. Changes to k8s API as described in [Changes to kubernetes API (https://pkg.go.dev/k8s.io/api/core/v1)](#changes-to-kubernetes-api-httpspkggodevk8sioapicorev1) implemented. 6. Update to kubelet code to sets ambient capabilities and these changes are hidden behind the feature-gate. 
@@ -532,7 +533,7 @@ required to enable ambient capabilities is in place. It includes the following w #### Beta -While alpha was focusd on setting up the infra for this feature, beta will focus on the safe enablement. +While alpha was focused on setting up the infra for this feature, beta will focus on the safe enablement. 1. Collect feedback from Alpha and use information to guide the following sections need to be resolved before Beta. - [] [Restricted ambient capabilities.](#restricted-ambient-capabilities) @@ -718,10 +719,10 @@ Recall that end users cannot usually observe component logs or access metrics. --> - [ ] Events - - Event Reason: + - Event Reason: - [ ] API .status - - Condition name: - - Other field: + - Condition name: + - Other field: - [ ] Other (treat as last resort) - Details: From a19b9044bf1ace5602215cef1fd32a526e151626 Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Wed, 15 Sep 2021 10:52:37 -0700 Subject: [PATCH 3/7] update prod-readiness yaml. Change beta to alpha. --- keps/prod-readiness/sig-security/2763.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/prod-readiness/sig-security/2763.yaml b/keps/prod-readiness/sig-security/2763.yaml index 98538cdf54f..215b70ccb26 100644 --- a/keps/prod-readiness/sig-security/2763.yaml +++ b/keps/prod-readiness/sig-security/2763.yaml @@ -1,3 +1,3 @@ kep-number: 2763 -beta: +alpha: approver: "@ehashman" From a1745cc4009f38d7b466a90f3aa3bcdd36ec7378 Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Wed, 15 Sep 2021 18:07:35 -0700 Subject: [PATCH 4/7] Minor formatting fixes. 
--- keps/sig-security/2763-ambient-capabilities/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/keps/sig-security/2763-ambient-capabilities/README.md b/keps/sig-security/2763-ambient-capabilities/README.md index c87fa78ebfc..fe5c47d77a8 100644 --- a/keps/sig-security/2763-ambient-capabilities/README.md +++ b/keps/sig-security/2763-ambient-capabilities/README.md @@ -385,11 +385,11 @@ One risk of adding this feature is that it could be used to give non-root users _Blocking for Alpha._ There are 2 options here:- -- Option 1: Reuse Add field in [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) +- **Option 1:** Reuse Add field in [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) When a capability gets added explicitly to a non-root container it also gets added to the ambient set in addition to getting added to inheritable, permitted, bounding and effective sets. The default capabilities are not added to the ambient set. - Pros and Cons + **Pros and Cons** :+1: Simple add and drop API is easy to use from a user perspective. @@ -399,7 +399,7 @@ There are 2 options here:- :-1: Since we are changing the default behavior for non-root containers we might break existing user containers. Issues with linux capabilities have proven themselves to be very hard to debug in the past. -- Option 2: Add new field to [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) +- **Option 2:** Add new field to [Capabilities](https://pkg.go.dev/k8s.io/api/core/v1#Capabilities) When a capability is added using this field only then does it get added to the ambient set in addition to inheritable, permitted, bounding and effective sets. The behavior is consistent for root and non-root containers. @@ -417,13 +417,13 @@ There are 2 options here:- } ``` - Pros and Cons + **Pros and Cons** :+1: Since this is a new field existing user workloads are unaffected by this change. 
:+1: The capabilities struct only really translates to Linux Capabilities, adding a new field does make it easier for users who understand Linux capabilities to configure them. - :+1: Ambient caps are useful for root containers as well.A root process can control what caps it gives to child processes that are running as non-root. The capabilities list list for these non-root processes may not be the same as the root caps in Add. + :+1: Ambient caps are useful for root containers as well. A root process can control what caps it gives to child processes that are running as non-root. The capabilities list for these non-root processes may not be the same as the root caps in Add. :-1: Requires changes to the K8S API Capabilities object. From 846575e89de862a362b9f3fad55d0386e475521a Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Sun, 3 Oct 2021 19:01:55 -0700 Subject: [PATCH 5/7] Delete 2763.yaml because merging provisionally. --- keps/prod-readiness/sig-security/2763.yaml | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 keps/prod-readiness/sig-security/2763.yaml diff --git a/keps/prod-readiness/sig-security/2763.yaml b/keps/prod-readiness/sig-security/2763.yaml deleted file mode 100644 index 215b70ccb26..00000000000 --- a/keps/prod-readiness/sig-security/2763.yaml +++ /dev/null @@ -1,3 +0,0 @@ -kep-number: 2763 -alpha: - approver: "@ehashman" From 3b84e9c884acaf85777ca4937fd482b742401f29 Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Wed, 6 Oct 2021 18:21:48 -0700 Subject: [PATCH 6/7] Update kep.yaml --- keps/sig-security/2763-ambient-capabilities/kep.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/keps/sig-security/2763-ambient-capabilities/kep.yaml b/keps/sig-security/2763-ambient-capabilities/kep.yaml index 9420b094e1d..f5a14fb27a4 100644 --- a/keps/sig-security/2763-ambient-capabilities/kep.yaml +++ b/keps/sig-security/2763-ambient-capabilities/kep.yaml @@ -9,12 +9,13 @@ status: provisional creation-date: 2021-05-20 
reviewers: - "@PushkarJ" # sig-security + - "@tabbysable" # sig-security - "@tallclair" # sig-auth - "@mrunalp" # sig-node and CRI-O - "@mikebrow" # sig-node and containerd - "@SergeyKanzhelev" # sig-node and CRI API approvers: - - TBD + - "@tabbysable" prr-approvers: - "@ehashman" see-also: @@ -28,11 +29,11 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "alpha" +latest-milestone: "v1.23" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: - alpha: "1.23" + alpha: "v1.23" beta: "TBD" stable: "TBD" From 0cfb55c5b6dd4a0b511fd17701d86a778817b455 Mon Sep 17 00:00:00 2001 From: Vinayak Goyal Date: Wed, 6 Oct 2021 19:20:05 -0700 Subject: [PATCH 7/7] Update kep.yaml --- keps/sig-security/2763-ambient-capabilities/kep.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keps/sig-security/2763-ambient-capabilities/kep.yaml b/keps/sig-security/2763-ambient-capabilities/kep.yaml index f5a14fb27a4..c7e7529cd07 100644 --- a/keps/sig-security/2763-ambient-capabilities/kep.yaml +++ b/keps/sig-security/2763-ambient-capabilities/kep.yaml @@ -29,11 +29,11 @@ stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.23" +latest-milestone: "v1.24" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: - alpha: "v1.23" + alpha: "v1.24" beta: "TBD" stable: "TBD"