From 553bc5d3a6a267a173d4d968409efb4b6d2ca5bc Mon Sep 17 00:00:00 2001 From: Jefftree Date: Wed, 8 Jun 2022 17:16:21 +0000 Subject: [PATCH] Aggregated Discovery KEP --- .../sig-api-machinery/3352.yaml | 5 + .../3352-aggregated-discovery/README.md | 758 ++++++++++++++++++ .../3352-aggregated-discovery/kep.yaml | 48 ++ 3 files changed, 811 insertions(+) create mode 100644 keps/prod-readiness/sig-api-machinery/3352.yaml create mode 100644 keps/sig-api-machinery/3352-aggregated-discovery/README.md create mode 100644 keps/sig-api-machinery/3352-aggregated-discovery/kep.yaml diff --git a/keps/prod-readiness/sig-api-machinery/3352.yaml b/keps/prod-readiness/sig-api-machinery/3352.yaml new file mode 100644 index 000000000000..6b204c391767 --- /dev/null +++ b/keps/prod-readiness/sig-api-machinery/3352.yaml @@ -0,0 +1,5 @@ +kep-number: 3352 +alpha: + approver: "@deads2k" +beta: + approver: "@deads2k" diff --git a/keps/sig-api-machinery/3352-aggregated-discovery/README.md b/keps/sig-api-machinery/3352-aggregated-discovery/README.md new file mode 100644 index 000000000000..cec79e907cf8 --- /dev/null +++ b/keps/sig-api-machinery/3352-aggregated-discovery/README.md @@ -0,0 +1,758 @@ + +# KEP-3352: Aggregated Discovery + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [API](#api) + - [Aggregation](#aggregation) + - [Client](#client) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Deprecation](#deprecation) + - [Upgrade / 
Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ 
] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +## Motivation + + + +All clients and users of Kubernetes APIs usually first need to “discover” what the available APIs are and how they can be used. These APIs are described through a mechanism called “Discovery”, which is typically queried first in order to build requests to the correct APIs. Unfortunately, the “Discovery” API is made of lots of small objects that need to be queried individually, potentially causing a lot of delay due to the latency of each individual request (up to 80 requests, with most objects being less than 1,024 bytes). The more complex the APIs provided by the Kubernetes cluster, the more requests need to be performed. + +The best-known Kubernetes client that uses the discovery mechanism is `kubectl`, and more specifically the `CachedDiscoveryClient` in `client-go`. To mitigate some of this latency, kubectl has implemented a 6-hour(!) timer (previously 10 minutes) during which the discovery API is not refreshed. The drawback of this approach is that the freshness of the cache is doubtful and the entire discovery API needs to be refreshed after 6 hours, even if nothing has changed. + +This not only impacts kubectl, but all clients of Kubernetes. We can do better. + +### Goals + +- Fix the discovery storm issue currently present in kubectl +- Aggregate the discovery documents for all Kubernetes types + + + +### Non-Goals + + + +- Since the current discovery separated by group-version is already GA, removal of the endpoint will be difficult. This KEP will solely focus on introducing the new aggregated endpoint and will not cover deprecation. 
+ +## Proposal + +We are proposing a new endpoint `/discovery` as an aggregated endpoint for all discovery documents. Discovery documents can currently be found under `/apis/<group>/<version>` and `/api/v1` for the legacy group version. This discovery endpoint will support publishing an ETag so clients who already have the latest version of the aggregated discovery can avoid redownloading the document. + +We will add a new controller responsible for aggregating the discovery documents when a resource on the cluster changes. There will be no conflicts when aggregating since each discovery document is self-contained. + +### Notes/Constraints/Caveats (Optional) + + + +### Risks and Mitigations + + + +## Design Details + + + +We will expose an endpoint `/discovery` that will support JSON and +protobuf. The endpoint will serve the aggregated discovery document +for all types that a Kubernetes cluster supports. + +### API + +The contents of this endpoint will be an `APIGroupList`, which is the +same type that is returned in the discovery document at `/apis`. The +`APIGroupList` contains a list of `APIGroup` types, each of which includes a +list of +[`GroupVersionForDiscovery`](https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/apimachinery/pkg/apis/meta/v1/types.go#L1071) +types. We will modify this `GroupVersionForDiscovery` type to include +a list of +[`APIResource`](https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/apimachinery/pkg/apis/meta/v1/types.go#L1080). +This list of `APIResource` is what is currently published at the +`/apis/<group>/<version>` endpoint and is what will be aggregated into +the new endpoint that we publish. + +The endpoint will also publish an ETag calculated based on a hash of +the data for clients. + +### Aggregation + +For the aggregation layer on the server, a new controller will be created to aggregate discovery for built-in types, apiextensions types (CRDs), and types from aggregated API servers. 
+ +### Client + +The `client-go` interface will be modified to add a new method to retrieve the aggregated discovery document and `kubectl` will be the initial candidate. As a starting point, `kubectl api-resources` should use the aggregated discovery document instead of sending a storm of requests. + +### Test Plan + + + +[x] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +##### Unit tests + + + + + +This will be implemented in a new package in kube-aggregator. + +##### Integration tests + + + +For alpha, integration tests will be added to exercise the new aggregated discovery code path. + +##### e2e tests + + + +For alpha, tests will be added to exercise the new aggregated discovery code path for kubectl, both on the server and client side. + +### Graduation Criteria + + + +#### Alpha + +- Feature implemented behind a feature flag +- Initial e2e tests completed and enabled +- At least one client (kubectl) has an implementation to use the aggregated discovery feature + +#### Beta + +- kubectl uses the aggregated discovery feature by default + +#### GA + +- TBD + +**Note:** Generally we also wait at least two releases between beta and +GA/stable, because there's no opportunity for user feedback, or even bug reports, +in back-to-back releases. + +**For non-optional features moving to GA, the graduation criteria must include +[conformance tests].** + +[conformance tests]: https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md + +#### Deprecation + + +### Upgrade / Downgrade Strategy + +Aggregated discovery will be behind a feature gate. It is an in-memory feature and upgrade/downgrade is not a problem. 
+ +### Version Skew Strategy + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: AggregatedDiscovery + - Components depending on the feature gate: kube-apiserver + +###### Does enabling the feature change any default behavior? + +No + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + +Yes, the feature may be disabled by reverting the feature flag. + +###### What happens if we reenable the feature if it was previously rolled back? + +The feature does not depend on state, and can be disabled/enabled at will. + +###### Are there any tests for feature enablement/disablement? + + + +n/a + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? 
+ + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + +With aggregation, the size of the aggregated discovery document could be an issue in the future since clients will need to download the entire document on any resource update. At the moment, even with 3000 CRDs (already very unlikely), the total size is still smaller than 1MB. 
+ +## Alternatives + + diff --git a/keps/sig-api-machinery/3352-aggregated-discovery/kep.yaml b/keps/sig-api-machinery/3352-aggregated-discovery/kep.yaml new file mode 100644 index 000000000000..32f20b651b20 --- /dev/null +++ b/keps/sig-api-machinery/3352-aggregated-discovery/kep.yaml @@ -0,0 +1,48 @@ +title: Aggregated Discovery +kep-number: 3352 +authors: + - "@alexzielenski" + - "@jefftree" +owning-sig: sig-api-machinery +participating-sigs: + - sig-cli +status: implementable +creation-date: 2022-06-07 +reviewers: + - "@apelisse" + - "@seans3" +approvers: + - "@deads2k" + - "@lavalamp" + +##### WARNING !!! ###### +# prr-approvers has been moved to its own location +# You should create your own in keps/prod-readiness +# Please make a copy of keps/prod-readiness/template/nnnn.yaml +# to keps/prod-readiness/sig-xxxxx/00000.yaml (replace with kep number) +prr-approvers: + - "@deads2k" + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.25" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.25" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: AggregatedDiscovery + components: + - kube-apiserver +disable-supported: true + +# The following PRR answers are required at beta release +# metrics: +# - my_feature_metric