Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upstream: Implement retry concurrency budgets #9069

Merged
merged 77 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
ee13f16
proto
tonya11en Nov 13, 2019
413a55a
first pass
Nov 18, 2019
43804fd
basic tests
Nov 19, 2019
c43f0cd
cleanup tests
Nov 19, 2019
24e247d
docs and stat
Nov 19, 2019
86cd4d5
more docs
Nov 19, 2019
eb75475
remove notes to self
Nov 19, 2019
4a4e502
format fixes
Nov 19, 2019
38c53c8
Kick CI
Nov 21, 2019
0970941
Merge remote-tracking branch 'upstream/master' into percentage_retries
Nov 21, 2019
d5ec251
memory test
Nov 21, 2019
8d8cde0
snow comments
Nov 25, 2019
e74e782
snow struct
tonya11en Nov 28, 2019
b11c614
ref
tonya11en Nov 28, 2019
87ca3a8
Merge remote-tracking branch 'upstream/master' into percentage_retries
tonya11en Nov 28, 2019
9a0c03f
Override retry CB if budgets configured
tonya11en Nov 29, 2019
f9137c9
format
tonya11en Nov 29, 2019
4a50f10
docs
Dec 2, 2019
ea64ad7
Merge remote-tracking branch 'upstream/master' into percentage_retries
Dec 2, 2019
3002bcd
stats integration
Dec 2, 2019
0430c19
format
Dec 2, 2019
8053b7e
nits
Dec 3, 2019
8f23bed
Account for HTTP1 requests
Dec 4, 2019
9f409e2
Merge remote-tracking branch 'upstream/master' into percentage_retries
Dec 4, 2019
91de3ce
format
tonya11en Dec 4, 2019
92a8188
Merge remote-tracking branch 'origin/master' into percent_retry
Dec 5, 2019
a50970e
wip
Dec 6, 2019
34bc98c
wip
Dec 7, 2019
18c5a98
builds
Dec 9, 2019
3378b64
failed test runs
Dec 10, 2019
baf91ac
bugs
Dec 10, 2019
193e537
retry state impl tests
tonya11en Dec 10, 2019
4fc3fb5
refactor
tonya11en Dec 10, 2019
f050c30
robust min concurrency test
tonya11en Dec 10, 2019
a2fb127
min concurrency fix
tonya11en Dec 10, 2019
3bbe93c
broken conn pool test
tonya11en Dec 10, 2019
f6162c8
test http1 active_rq increment
Dec 10, 2019
1b45679
test count
Dec 10, 2019
0d33ec3
format
Dec 10, 2019
49753ce
Merge remote-tracking branch 'upstream/master' into percentage_retries
Dec 10, 2019
b635e91
doc
Dec 10, 2019
1f02666
proto fix
Dec 10, 2019
6cee795
fix proto and DCO
Dec 10, 2019
7d11191
format after merge
Dec 10, 2019
7ce58e7
docs
Dec 10, 2019
c318a95
docs build
Dec 10, 2019
dde62da
format again
Dec 10, 2019
2344034
format again
Dec 10, 2019
51a8f33
make function part of impl
Dec 10, 2019
cf9c6ef
never ending fix format
Dec 11, 2019
9a8e947
Kick CI
Dec 11, 2019
0e100d1
matt comments
Dec 11, 2019
ef41c69
docs
Dec 12, 2019
81941c0
format
Dec 12, 2019
12198cd
Update retry_state_impl.cc
tonya11en Dec 12, 2019
74e5bbb
move into resource manager
tonya11en Dec 13, 2019
5285e3e
format
tonya11en Dec 13, 2019
d3ed226
docs
tonya11en Dec 13, 2019
659f480
Removed unused struct
Dec 13, 2019
61d7057
fix stats integration test
Dec 13, 2019
f70acbe
memory fix again
Dec 14, 2019
a77d1f2
memory test
tonya11en Dec 15, 2019
105dd0d
matt comments
Dec 20, 2019
0c46399
Merge remote-tracking branch 'origin/master' into pr
Dec 20, 2019
b9f46ec
broken mem test
Dec 20, 2019
c78c39e
Kick CI
Dec 20, 2019
bb9e81e
fix stats
Dec 21, 2019
d21677d
Merge remote-tracking branch 'upstream/master' into percentage_retries
tonya11en Dec 28, 2019
4cc50dd
order version history
tonya11en Dec 28, 2019
f9cffe0
fix merge issue
tonya11en Dec 28, 2019
94f63e8
Merge remote-tracking branch 'upstream/master' into percentage_retries
tonya11en Dec 29, 2019
4fb492a
format
tonya11en Dec 29, 2019
005317c
Merge remote-tracking branch 'upstream/master' into percentage_retries
tonya11en Dec 31, 2019
dc5f95e
proto format
tonya11en Dec 31, 2019
068e0e7
Merge remote-tracking branch 'upstream/master' into percentage_retries
tonya11en Jan 6, 2020
2446f7b
format
tonya11en Jan 6, 2020
da5fe31
nits
tonya11en Jan 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion api/envoy/api/v2/route/route.proto
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,7 @@ message RouteAction {
}

// HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
// [#next-free-field: 11]
// [#next-free-field: 12]
message RetryPolicy {
message RetryPriority {
string name = 1 [(validate.rules).string = {min_bytes: 1}];
Expand Down Expand Up @@ -893,6 +893,20 @@ message RetryPolicy {
google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {}}];
}

message RetryBudget {
mattklein123 marked this conversation as resolved.
Show resolved Hide resolved
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
// Specifies the limit on concurrent retries as a percentage of outstanding requests. This
// parameter is optional.
//
// Defaults to 20%.
type.Percent percent_budget = 1;

// Specifies the minimum retry concurrency allowed for the retry budget. This parameter is
// optional.
//
// Defaults to 10.
google.protobuf.UInt32Value min_concurrency = 2;
mattklein123 marked this conversation as resolved.
Show resolved Hide resolved
}

// Specifies the conditions under which retry takes place. These are the same
// conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and
// :ref:`config_http_filters_router_x-envoy-retry-grpc-on`.
Expand Down Expand Up @@ -949,6 +963,10 @@ message RetryPolicy {

// HTTP headers which must be present in the request for retries to be attempted.
repeated HeaderMatcher retriable_request_headers = 10;

// Specifies a limit on concurrent retries in relation to the number of active
// requests/connections. This parameter is optional.
RetryBudget retry_budget = 11;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Right now we can set a retry policy on either the virtual host or the route, with the route overriding the virtual host. I don't feel strongly about this, but I'm wondering if it would be a better experience to allow the retry budget to be set independently from the policy. In this case, if the user sets a per-route policy, they will have to duplicate the budget across every policy because I don't think anything is merged from the virtual host. WDYT? @snowp?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have a cluster-level retry budget config and have each route optionally override it. It might get confusing to reason about where retry overflows are coming from, since we're using the retry_overflow counter for both the max_retries circuit breaker and the retry budget.

I can start adding this top-level budget to a different branch and defer to you as to whether it should go in this patch or a subsequent one along with runtime knobs and the solution to the HTTP/1.1 request tracking problem below.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think this is going to be problematic at Lyft? Or you think we can just configure this universally? I'm concerned that it's going to cause problems with us for our own usage right away?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty confident this can just be configured universally.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the config you have now how can we configure this universally?

}

// HTTP request hedging :ref:`architecture overview <arch_overview_http_routing_hedging>`.
Expand Down
20 changes: 19 additions & 1 deletion api/envoy/api/v3alpha/route/route.proto
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ message RouteAction {
}

// HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
// [#next-free-field: 11]
// [#next-free-field: 12]
message RetryPolicy {
message RetryPriority {
reserved 2;
Expand Down Expand Up @@ -837,6 +837,20 @@ message RetryPolicy {
google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {}}];
}

message RetryBudget {
// Specifies the limit on concurrent retries as a percentage of outstanding requests. This
// parameter is optional.
//
// Defaults to 20%.
type.v3alpha.Percent percent_budget = 1;

// Specifies the minimum retry concurrency allowed for the retry budget. This parameter is
// optional.
//
// Defaults to 10.
google.protobuf.UInt32Value min_concurrency = 2;
}

// Specifies the conditions under which retry takes place. These are the same
// conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and
// :ref:`config_http_filters_router_x-envoy-retry-grpc-on`.
Expand Down Expand Up @@ -893,6 +907,10 @@ message RetryPolicy {

// HTTP headers which must be present in the request for retries to be attempted.
repeated HeaderMatcher retriable_request_headers = 10;

// Specifies a limit on concurrent retries in relation to the number of active
// requests/connections. This parameter is optional.
RetryBudget retry_budget = 11;
}

// HTTP request hedging :ref:`architecture overview <arch_overview_http_routing_hedging>`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Every cluster has a statistics tree rooted at *cluster.<name>.* with the followi
upstream_rq_rx_reset, Counter, Total requests that were reset remotely
upstream_rq_tx_reset, Counter, Total requests that were reset locally
upstream_rq_retry, Counter, Total request retries
upstream_rq_retry_budget_exceeded, Counter, Total requests not retried due to retry budgets
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
upstream_rq_retry_success, Counter, Total request retry successes
upstream_rq_retry_overflow, Counter, Total requests not retried due to circuit breaking
upstream_flow_control_paused_reading_total, Counter, Total number of times flow control paused reading from upstream
Expand Down
18 changes: 18 additions & 0 deletions include/envoy/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ class RetryPolicy {
static const uint32_t RETRY_ON_RETRIABLE_HEADERS = 0x1000;
// clang-format on

/**
* Limitations placed on concurrent retries as a percentage of the number of active requests.
*/
struct RetryBudget {
// The percentage of active requests that are allowed to be retries.
double budget_pct;

// The minimum number of active requests before enforcing the retry budget.
uint32_t min_concurrency;
};
using RetryBudget = struct RetryBudget;
tonya11en marked this conversation as resolved.
Show resolved Hide resolved

virtual ~RetryPolicy() = default;

/**
Expand Down Expand Up @@ -227,6 +239,12 @@ class RetryPolicy {
* @return absl::optional<std::chrono::milliseconds> maximum retry interval
*/
virtual absl::optional<std::chrono::milliseconds> maxInterval() const PURE;

/**
* @return absl::optional<RetryBudget> limit on allowed concurrent retries in relation to current
* outstanding requests.
*/
virtual absl::optional<RetryBudget> retryBudget() const PURE;
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
};

/**
Expand Down
5 changes: 5 additions & 0 deletions include/envoy/upstream/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class Resource {
* @return the current maximum allowed number of this resource.
*/
virtual uint64_t max() PURE;

/**
* @return the current resource count.
*/
virtual uint64_t count() PURE;
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
};

/**
Expand Down
1 change: 1 addition & 0 deletions include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ class PrioritySet {
COUNTER(upstream_rq_pending_total) \
COUNTER(upstream_rq_per_try_timeout) \
COUNTER(upstream_rq_retry) \
COUNTER(upstream_rq_retry_budget_exceeded) \
COUNTER(upstream_rq_retry_overflow) \
COUNTER(upstream_rq_retry_success) \
COUNTER(upstream_rq_rx_reset) \
Expand Down
1 change: 1 addition & 0 deletions source/common/http/async_client_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class AsyncStreamImpl : public AsyncClient::Stream,
return absl::nullopt;
}
absl::optional<std::chrono::milliseconds> maxInterval() const override { return absl::nullopt; }
absl::optional<RetryBudget> retryBudget() const override { return absl::nullopt; }

const std::vector<uint32_t> retriable_status_codes_;
const std::vector<Http::HeaderMatcherSharedPtr> retriable_headers_;
Expand Down
8 changes: 8 additions & 0 deletions source/common/router/config_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ RetryPolicyImpl::RetryPolicyImpl(const envoy::api::v2::route::RetryPolicy& retry
"retry_policy.max_interval must greater than or equal to the base_interval");
}
}

if (retry_policy.has_retry_budget()) {
retry_budget_ = {};
retry_budget_->budget_pct =
PROTOBUF_PERCENT_TO_DOUBLE_OR_DEFAULT(retry_policy.retry_budget(), percent_budget, 20.0);
retry_budget_->min_concurrency =
PROTOBUF_GET_WRAPPED_OR_DEFAULT(retry_policy.retry_budget(), min_concurrency, 10);
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions source/common/router/config_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ class RetryPolicyImpl : public RetryPolicy {
}
absl::optional<std::chrono::milliseconds> baseInterval() const override { return base_interval_; }
absl::optional<std::chrono::milliseconds> maxInterval() const override { return max_interval_; }
absl::optional<RetryBudget> retryBudget() const override { return retry_budget_; }

private:
std::chrono::milliseconds per_try_timeout_{0};
Expand All @@ -274,6 +275,7 @@ class RetryPolicyImpl : public RetryPolicy {
absl::optional<std::chrono::milliseconds> base_interval_;
absl::optional<std::chrono::milliseconds> max_interval_;
ProtobufMessage::ValidationVisitor* validation_visitor_{};
absl::optional<RetryBudget> retry_budget_;
};

/**
Expand Down
27 changes: 26 additions & 1 deletion source/common/router/retry_state_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ RetryStateImpl::RetryStateImpl(const RetryPolicy& route_policy, Http::HeaderMap&
retry_priority_(route_policy.retryPriority()),
retriable_status_codes_(route_policy.retriableStatusCodes()),
retriable_headers_(route_policy.retriableHeaders()),
retriable_request_headers_(route_policy.retriableRequestHeaders()) {
retriable_request_headers_(route_policy.retriableRequestHeaders()),
retry_budget_(route_policy.retryBudget()) {

retry_on_ = route_policy.retryOn();
retries_remaining_ = std::max(retries_remaining_, route_policy.numRetries());
Expand Down Expand Up @@ -232,6 +233,11 @@ RetryStatus RetryStateImpl::shouldRetry(bool would_retry, DoRetryCallback callba
return RetryStatus::No;
}

if (retryBudgetExceeded()) {
cluster_.stats().upstream_rq_retry_budget_exceeded_.inc();
return RetryStatus::NoRetryLimitExceeded;
}

ASSERT(!callback_);
callback_ = callback;
cluster_.resourceManager(priority_).retries().inc();
Expand All @@ -240,6 +246,25 @@ RetryStatus RetryStateImpl::shouldRetry(bool would_retry, DoRetryCallback callba
return RetryStatus::Yes;
}

bool RetryStateImpl::retryBudgetExceeded() {
if (!retry_budget_) {
return false;
}

// If a retry budget was configured, we cannot exceed the configured percentage of total
// outstanding requests/connections.
const uint64_t current_active = cluster_.resourceManager(priority_).connections().count() +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems a bit iffy to me: today kinda this works before for H/1.1 and H/2 make use of either max_connections OR max_requests so this works, but there are requested features that might introduce max_connections to H/2 as well (#7403). Envoy also doesn't reclaim H/1.1 connections until it needs to, so I think using max_connections to get a sense for the current concurrency isn't perfect.

I think ideally we'd be counting # of requests for HTTP/1.1 and just do max_requests + max_pending, but I don't think we can just add that enforcement safely at this point (since users might have bumped max_con but not max_req). Maybe just note this limitation in the docs for now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noted in the docs. Let me know if the latest patch communicates this well enough.

cluster_.resourceManager(priority_).requests().count() +
cluster_.resourceManager(priority_).pendingRequests().count();

if (current_active < retry_budget_->min_concurrency) {
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
return false;
}

const double normalized_pct = retry_budget_->budget_pct / 100.0;
tonya11en marked this conversation as resolved.
Show resolved Hide resolved
return cluster_.resourceManager(priority_).retries().count() >= normalized_pct * current_active;
}

RetryStatus RetryStateImpl::shouldRetryHeaders(const Http::HeaderMap& response_headers,
DoRetryCallback callback) {
return shouldRetry(wouldRetryFromHeaders(response_headers), callback);
Expand Down
2 changes: 2 additions & 0 deletions source/common/router/retry_state_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class RetryStateImpl : public RetryState {
void enableBackoffTimer();
void resetRetry();
bool wouldRetryFromReset(const Http::StreamResetReason reset_reason);
bool retryBudgetExceeded();
RetryStatus shouldRetry(bool would_retry, DoRetryCallback callback);

const Upstream::ClusterInfo& cluster_;
Expand All @@ -111,6 +112,7 @@ class RetryStateImpl : public RetryState {
std::vector<uint32_t> retriable_status_codes_;
std::vector<Http::HeaderMatcherSharedPtr> retriable_headers_;
std::vector<Http::HeaderMatcherSharedPtr> retriable_request_headers_;
absl::optional<RetryPolicy::RetryBudget> retry_budget_;
};

} // namespace Router
Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/resource_manager_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class ResourceManagerImpl : public ResourceManager {
open_gauge_.set(canCreate() ? 0 : 1);
}
uint64_t max() override { return runtime_.snapshot().getInteger(runtime_key_, max_); }
uint64_t count() override { return current_.load(); }

/**
* We set the gauge instead of incrementing and decrementing because,
Expand Down
Loading