Skip to content

Commit

Permalink
upstream: Implement retry concurrency budgets (#9069)
Browse files Browse the repository at this point in the history
Signed-off-by: Tony Allen <[email protected]>
  • Loading branch information
tonya11en authored and mattklein123 committed Jan 7, 2020
1 parent e7a7e8b commit 3ed917f
Show file tree
Hide file tree
Showing 26 changed files with 510 additions and 32 deletions.
1 change: 1 addition & 0 deletions api/envoy/api/v2/cluster/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ licenses(["notice"]) # Apache 2
api_proto_package(
deps = [
"//envoy/api/v2/core:pkg",
"//envoy/type:pkg",
"@com_github_cncf_udpa//udpa/annotations:pkg",
],
)
32 changes: 31 additions & 1 deletion api/envoy/api/v2/cluster/circuit_breaker.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ syntax = "proto3";
package envoy.api.v2.cluster;

import "envoy/api/v2/core/base.proto";
import "envoy/type/percent.proto";

import "google/protobuf/wrappers.proto";

Expand All @@ -23,8 +24,23 @@ option (udpa.annotations.file_migrate).move_to_package = "envoy.config.cluster.v
message CircuitBreakers {
// A Thresholds defines CircuitBreaker settings for a
// :ref:`RoutingPriority<envoy_api_enum_core.RoutingPriority>`.
// [#next-free-field: 8]
// [#next-free-field: 9]
message Thresholds {
// Settings for a retry budget, which limits the number of concurrent retries in relation to the
// number of active requests. Configured through the `retry_budget` field of Thresholds.
message RetryBudget {
// Specifies the limit on concurrent retries as a percentage of the sum of active requests and
// active pending requests. For example, if there are 100 active requests and the
// budget_percent is set to 25, there may be up to 25 active retries.
//
// This parameter is optional. Defaults to 20%.
type.Percent budget_percent = 1;

// Specifies the minimum retry concurrency allowed for the retry budget. The limit on the
// number of active retries may never go below this number.
//
// This parameter is optional. Defaults to 3.
google.protobuf.UInt32Value min_retry_concurrency = 2;
}

// The :ref:`RoutingPriority<envoy_api_enum_core.RoutingPriority>`
// the specified CircuitBreaker settings apply to.
core.RoutingPriority priority = 1 [(validate.rules).enum = {defined_only: true}];
Expand All @@ -45,9 +61,23 @@ message CircuitBreakers {
// upstream cluster. If not specified, the default is 3.
google.protobuf.UInt32Value max_retries = 5;

// Specifies a limit on concurrent retries in relation to the number of active requests. This
// parameter is optional.
//
// .. note::
//
// If this field is set, the retry budget will override any configured retry circuit
// breaker.
RetryBudget retry_budget = 8;

// If track_remaining is true, then stats will be published that expose
// the number of resources remaining until the circuit breakers open. If
// not specified, the default is false.
//
// .. note::
//
// If a retry budget is used in lieu of the max_retries circuit breaker,
// the remaining retry resources will not be tracked.
bool track_remaining = 6;

// The maximum number of connection pools per cluster that Envoy will concurrently support at
Expand Down
35 changes: 34 additions & 1 deletion api/envoy/config/cluster/v3alpha/circuit_breaker.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ syntax = "proto3";
package envoy.config.cluster.v3alpha;

import "envoy/config/core/v3alpha/base.proto";
import "envoy/type/v3alpha/percent.proto";

import "google/protobuf/wrappers.proto";

Expand All @@ -24,11 +25,29 @@ message CircuitBreakers {

// A Thresholds defines CircuitBreaker settings for a
// :ref:`RoutingPriority<envoy_api_enum_config.core.v3alpha.RoutingPriority>`.
// [#next-free-field: 8]
// [#next-free-field: 9]
message Thresholds {
option (udpa.annotations.versioning).previous_message_type =
"envoy.api.v2.cluster.CircuitBreakers.Thresholds";

// Settings for a retry budget, which limits the number of concurrent retries in relation to the
// number of active requests. Configured through the `retry_budget` field of Thresholds.
message RetryBudget {
option (udpa.annotations.versioning).previous_message_type =
"envoy.api.v2.cluster.CircuitBreakers.Thresholds.RetryBudget";

// Specifies the limit on concurrent retries as a percentage of the sum of active requests and
// active pending requests. For example, if there are 100 active requests and the
// budget_percent is set to 25, there may be up to 25 active retries.
//
// This parameter is optional. Defaults to 20%.
type.v3alpha.Percent budget_percent = 1;

// Specifies the minimum retry concurrency allowed for the retry budget. The limit on the
// number of active retries may never go below this number.
//
// This parameter is optional. Defaults to 3.
google.protobuf.UInt32Value min_retry_concurrency = 2;
}

// The :ref:`RoutingPriority<envoy_api_enum_config.core.v3alpha.RoutingPriority>`
// the specified CircuitBreaker settings apply to.
core.v3alpha.RoutingPriority priority = 1 [(validate.rules).enum = {defined_only: true}];
Expand All @@ -49,9 +68,23 @@ message CircuitBreakers {
// upstream cluster. If not specified, the default is 3.
google.protobuf.UInt32Value max_retries = 5;

// Specifies a limit on concurrent retries in relation to the number of active requests. This
// parameter is optional.
//
// .. note::
//
// If this field is set, the retry budget will override any configured retry circuit
// breaker.
RetryBudget retry_budget = 8;

// If track_remaining is true, then stats will be published that expose
// the number of resources remaining until the circuit breakers open. If
// not specified, the default is false.
//
// .. note::
//
// If a retry budget is used in lieu of the max_retries circuit breaker,
// the remaining retry resources will not be tracked.
bool track_remaining = 6;

// The maximum number of connection pools per cluster that Envoy will concurrently support at
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,9 @@ circuit_breakers.<cluster_name>.<priority>.max_requests

circuit_breakers.<cluster_name>.<priority>.max_retries
:ref:`Max retries circuit breaker setting <envoy_api_field_cluster.CircuitBreakers.Thresholds.max_retries>`

circuit_breakers.<cluster_name>.<priority>.retry_budget.budget_percent
:ref:`Retry budget percent circuit breaker setting <envoy_api_field_cluster.CircuitBreakers.Thresholds.RetryBudget.budget_percent>`

circuit_breakers.<cluster_name>.<priority>.retry_budget.min_retry_concurrency
:ref:`Retry budget minimum retry concurrency circuit breaker setting <envoy_api_field_cluster.CircuitBreakers.Thresholds.RetryBudget.min_retry_concurrency>`
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ Every cluster has a statistics tree rooted at *cluster.<name>.* with the followi
upstream_rq_tx_reset, Counter, Total requests that were reset locally
upstream_rq_retry, Counter, Total request retries
upstream_rq_retry_success, Counter, Total request retry successes
upstream_rq_retry_overflow, Counter, Total requests not retried due to circuit breaking
upstream_rq_retry_overflow, Counter, Total requests not retried due to circuit breaking or exceeding the retry budget
upstream_flow_control_paused_reading_total, Counter, Total number of times flow control paused reading from upstream
upstream_flow_control_resumed_reading_total, Counter, Total number of times flow control resumed reading from upstream
upstream_flow_control_backed_up_total, Counter, Total number of times the upstream connection backed up and paused reads from downstream
Expand Down
2 changes: 2 additions & 0 deletions docs/root/intro/arch_overview/http/http_routing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ headers <config_http_filters_router_headers_consumed>`. The following configurat
* **Retry conditions**: Envoy can retry on different types of conditions depending on application
requirements. For example, network failure, all 5xx response codes, idempotent 4xx response codes,
etc.
* **Retry budgets**: Envoy can limit the proportion of active requests that can be retries via :ref:`retry budgets <envoy_api_field_cluster.CircuitBreakers.Thresholds.retry_budget>` to
prevent their contribution to large increases in traffic volume.
* **Host selection retry plugins**: Envoy can be configured to apply additional logic to the host
selection logic when selecting hosts for retries. Specifying a
:ref:`retry host predicate <envoy_api_field_route.RetryPolicy.retry_host_predicate>`
Expand Down
4 changes: 2 additions & 2 deletions docs/root/intro/arch_overview/upstream/circuit_breaking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ configure and code each application independently. Envoy supports various types
overflows the :ref:`upstream_rq_pending_overflow <config_cluster_manager_cluster_stats>` counter
for the cluster will increment.
* **Cluster maximum active retries**: The maximum number of retries that can be outstanding to all
hosts in a cluster at any given time. In general we recommend aggressively circuit breaking
retries so that retries for sporadic failures are allowed but the overall retry volume cannot
hosts in a cluster at any given time. In general we recommend using :ref:`retry budgets <envoy_api_field_cluster.CircuitBreakers.Thresholds.retry_budget>`; however, if static circuit breaking is preferred it should aggressively circuit break
retries. This is so that retries for sporadic failures are allowed, but the overall retry volume cannot
explode and cause large scale cascading failure. If this circuit breaker overflows the
:ref:`upstream_rq_retry_overflow <config_cluster_manager_cluster_stats>` counter for the cluster
will increment.
Expand Down
10 changes: 5 additions & 5 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ Version history
* redis: performance improvement for larger split commands by avoiding string copies.
* redis: correctly follow MOVE/ASK redirection for mirrored clusters.
* redis: add :ref:`host_degraded_refresh_threshold <envoy_api_field_config.cluster.redis.RedisClusterConfig.host_degraded_refresh_threshold>` and :ref:`failure_refresh_threshold <envoy_api_field_config.cluster.redis.RedisClusterConfig.failure_refresh_threshold>` to refresh topology when nodes are degraded or when requests fails.
* router: added support for REQ(header-name) :ref:`header formatter <config_http_conn_man_headers_custom_request_headers>`.
* router check tool: added support for testing and marking coverage for routes of runtime fraction 0.
* router: added :ref:`request_mirror_policies<envoy_api_field_route.RouteAction.request_mirror_policies>` to support sending multiple mirrored requests in one route.
* router: allow using a :ref:`query parameter
<envoy_api_field_route.RouteAction.HashPolicy.query_parameter>` for HTTP consistent hashing.
* router: skip the Location header when the response code is not a 201 or a 3xx.
* router: added support for REQ(header-name) :ref:`header formatter <config_http_conn_man_headers_custom_request_headers>`.
* router: added support for percentage-based :ref:`retry budgets <envoy_api_field_cluster.CircuitBreakers.Thresholds.retry_budget>`.
* router: allow using a :ref:`query parameter <envoy_api_field_route.RouteAction.HashPolicy.query_parameter>` for HTTP consistent hashing.
* router: exposed DOWNSTREAM_REMOTE_ADDRESS as custom HTTP request/response headers.
* router check tool: added support for testing and marking coverage for routes of runtime fraction 0.
* router: skip the Location header when the response code is not a 201 or a 3xx.
* server: added the :option:`--disable-extensions` CLI option, to disable extensions at startup.
* server: fixed a bug in config validation for configs with runtime layers.
* tcp_proxy: added :ref:`ClusterWeight.metadata_match<envoy_api_field_config.filter.network.tcp_proxy.v2.TcpProxy.WeightedCluster.ClusterWeight.metadata_match>`.
Expand Down
1 change: 1 addition & 0 deletions generated_api_shadow/envoy/api/v2/cluster/BUILD

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 31 additions & 1 deletion generated_api_shadow/envoy/api/v2/cluster/circuit_breaker.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions include/envoy/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,13 @@ class Snapshot {
*/
virtual const std::string& get(const std::string& key) const PURE;

/**
* Returns whether the key has any value set.
* @param key supplies the key to check.
* @return bool true if the key exists, false otherwise.
*/
virtual bool exists(const std::string& key) const PURE;

/**
* Fetch an integer runtime key. Runtime keys larger than ~2^53 may not be accurately converted
* into integers and will return default_value.
Expand Down
5 changes: 5 additions & 0 deletions include/envoy/upstream/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class Resource {
* @return the current maximum allowed number of this resource.
*/
virtual uint64_t max() PURE;

/**
* @return the current resource count.
* NOTE(review): presumably the running value adjusted by inc()/dec(), as opposed to the
* ceiling reported by max() -- confirm against the implementations.
*/
virtual uint64_t count() const PURE;
};

/**
Expand Down
9 changes: 9 additions & 0 deletions source/common/http/http1/conn_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,20 @@ ConnPoolImpl::StreamWrapper::StreamWrapper(StreamDecoder& response_decoder, Acti
StreamEncoderWrapper::inner_.getStream().addCallbacks(*this);
parent_.parent_.host_->cluster().stats().upstream_rq_active_.inc();
parent_.parent_.host_->stats().rq_active_.inc();

// TODO (tonya11en): At the time of writing, there is no way to mix different versions of HTTP
// traffic in the same cluster, so incrementing the request count in the per-cluster resource
// manager will not affect circuit breaking in any unexpected ways. Ideally, outstanding request
// counts would be tracked the same way in all HTTP versions.
//
// See: https://github.com/envoyproxy/envoy/issues/9215
parent_.parent_.host_->cluster().resourceManager(parent_.parent_.priority_).requests().inc();
}

ConnPoolImpl::StreamWrapper::~StreamWrapper() {
  // Release the per-request accounting held by this stream: the cluster-level and host-level
  // active request stats, then the request count in the resource manager for this priority.
  const auto& upstream_host = parent_.parent_.host_;
  upstream_host->cluster().stats().upstream_rq_active_.dec();
  upstream_host->stats().rq_active_.dec();
  upstream_host->cluster().resourceManager(parent_.parent_.priority_).requests().dec();
}

void ConnPoolImpl::StreamWrapper::onEncodeComplete() { encode_complete_ = true; }
Expand Down
1 change: 1 addition & 0 deletions source/common/runtime/runtime_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ class SnapshotImpl : public Snapshot,
double getDouble(const std::string& key, double default_value) const override;
bool getBoolean(absl::string_view key, bool value) const override;
const std::vector<OverrideLayerConstPtr>& getLayers() const override;
// Reports whether `key` is present in values_.
bool exists(const std::string& key) const override { return values_.contains(key); }

static Entry createEntry(const std::string& value);
static Entry createEntry(const ProtobufWkt::Value& value);
Expand Down
Loading

0 comments on commit 3ed917f

Please sign in to comment.