envoyproxy · mattklein123 · Jan 7, 2020 · Nov 13, 2019 · Nov 18, 2019 · Nov 19, 2019
diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto
@@ -854,7 +854,7 @@ message RouteAction {
 }
 
 // HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
-// [#next-free-field: 11]
+// [#next-free-field: 12]
 message RetryPolicy {
   message RetryPriority {
     string name = 1 [(validate.rules).string = {min_bytes: 1}];
@@ -893,6 +893,20 @@ message RetryPolicy {
     google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {}}];
   }
 
+  message RetryBudget {
+    // Specifies the limit on concurrent retries as a percentage of outstanding requests. This
+    // parameter is optional.
+    //
+    // Defaults to 20%.
+    type.Percent percent_budget = 1;
+
+    // Specifies the number of active requests/connections below which the retry budget is not
+    // enforced. This parameter is optional.
+    //
+    // Defaults to 10.
+    google.protobuf.UInt32Value min_concurrency = 2;
+  }
+
   // Specifies the conditions under which retry takes place. These are the same
   // conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and
   // :ref:`config_http_filters_router_x-envoy-retry-grpc-on`.
@@ -949,6 +963,10 @@ message RetryPolicy {
 
   // HTTP headers which must be present in the request for retries to be attempted.
   repeated HeaderMatcher retriable_request_headers = 10;
+
+  // Specifies a limit on concurrent retries in relation to the number of active
+  // requests/connections. This parameter is optional.
+  RetryBudget retry_budget = 11;
 }
 
 // HTTP request hedging :ref:`architecture overview <arch_overview_http_routing_hedging>`.

diff --git a/api/envoy/api/v3alpha/route/route.proto b/api/envoy/api/v3alpha/route/route.proto
@@ -794,7 +794,7 @@ message RouteAction {
 }
 
 // HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
-// [#next-free-field: 11]
+// [#next-free-field: 12]
 message RetryPolicy {
   message RetryPriority {
     reserved 2;
@@ -837,6 +837,20 @@ message RetryPolicy {
     google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {}}];
   }
 
+  message RetryBudget {
+    // Specifies the limit on concurrent retries as a percentage of outstanding requests. This
+    // parameter is optional.
+    //
+    // Defaults to 20%.
+    type.v3alpha.Percent percent_budget = 1;
+
+    // Specifies the number of active requests/connections below which the retry budget is not
+    // enforced. This parameter is optional.
+    //
+    // Defaults to 10.
+    google.protobuf.UInt32Value min_concurrency = 2;
+  }
+
   // Specifies the conditions under which retry takes place. These are the same
   // conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and
   // :ref:`config_http_filters_router_x-envoy-retry-grpc-on`.
@@ -893,6 +907,10 @@ message RetryPolicy {
 
   // HTTP headers which must be present in the request for retries to be attempted.
   repeated HeaderMatcher retriable_request_headers = 10;
+
+  // Specifies a limit on concurrent retries in relation to the number of active
+  // requests/connections. This parameter is optional.
+  RetryBudget retry_budget = 11;
 }
 
 // HTTP request hedging :ref:`architecture overview <arch_overview_http_routing_hedging>`.

diff --git a/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst b/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst
@@ -73,6 +73,7 @@ Every cluster has a statistics tree rooted at *cluster.<name>.* with the followi
   upstream_rq_rx_reset, Counter, Total requests that were reset remotely
   upstream_rq_tx_reset, Counter, Total requests that were reset locally
   upstream_rq_retry, Counter, Total request retries
+  upstream_rq_retry_budget_exceeded, Counter, Total requests not retried due to retry budgets
   upstream_rq_retry_success, Counter, Total request retry successes
   upstream_rq_retry_overflow, Counter, Total requests not retried due to circuit breaking
   upstream_flow_control_paused_reading_total, Counter, Total number of times flow control paused reading from upstream

diff --git a/include/envoy/router/router.h b/include/envoy/router/router.h
@@ -164,6 +164,18 @@ class RetryPolicy {
   static const uint32_t RETRY_ON_RETRIABLE_HEADERS       = 0x1000;
   // clang-format on
 
+  /**
+   * Limitations placed on concurrent retries as a percentage of the number of active requests.
+   * Members are zero-initialized by default so an unset field is never read as garbage.
+   */
+  struct RetryBudget {
+    // Percentage of active requests allowed to be retries, in percent units (20.0 means 20%).
+    double budget_pct{0.0};
+
+    // The minimum number of active requests before enforcing the retry budget.
+    uint32_t min_concurrency{0};
+  };
+
   virtual ~RetryPolicy() = default;
 
   /**
@@ -227,6 +239,12 @@ class RetryPolicy {
    * @return absl::optional<std::chrono::milliseconds> maximum retry interval
    */
   virtual absl::optional<std::chrono::milliseconds> maxInterval() const PURE;
+
+  /**
+   * @return absl::optional<RetryBudget> limit on allowed concurrent retries in relation to current
+   * outstanding requests, or absl::nullopt if no retry budget applies.
+   */
+  virtual absl::optional<RetryBudget> retryBudget() const PURE;
 };
 
 /**

diff --git a/include/envoy/upstream/resource_manager.h b/include/envoy/upstream/resource_manager.h
@@ -47,6 +47,11 @@ class Resource {
    * @return the current maximum allowed number of this resource.
    */
   virtual uint64_t max() PURE;
+
+  /**
+   * @return the current resource count.
+   */
+  virtual uint64_t count() PURE;
 };
 
 /**

diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h
@@ -568,6 +568,7 @@ class PrioritySet {
   COUNTER(upstream_rq_pending_total)                                                               \
   COUNTER(upstream_rq_per_try_timeout)                                                             \
   COUNTER(upstream_rq_retry)                                                                       \
+  COUNTER(upstream_rq_retry_budget_exceeded)                                                       \
   COUNTER(upstream_rq_retry_overflow)                                                              \
   COUNTER(upstream_rq_retry_success)                                                               \
   COUNTER(upstream_rq_rx_reset)                                                                    \

diff --git a/source/common/http/async_client_impl.h b/source/common/http/async_client_impl.h
@@ -143,6 +143,7 @@ class AsyncStreamImpl : public AsyncClient::Stream,
       return absl::nullopt;
     }
     absl::optional<std::chrono::milliseconds> maxInterval() const override { return absl::nullopt; }
+    absl::optional<RetryBudget> retryBudget() const override { return absl::nullopt; }
 
     const std::vector<uint32_t> retriable_status_codes_;
     const std::vector<Http::HeaderMatcherSharedPtr> retriable_headers_;

diff --git a/source/common/router/config_impl.cc b/source/common/router/config_impl.cc
@@ -121,6 +121,14 @@ RetryPolicyImpl::RetryPolicyImpl(const envoy::api::v2::route::RetryPolicy& retry
             "retry_policy.max_interval must greater than or equal to the base_interval");
       }
     }
+
+    if (retry_policy.has_retry_budget()) {
+      retry_budget_.emplace(); // Engage first: `= {}` would reset the optional to nullopt.
+      retry_budget_->budget_pct =
+          PROTOBUF_PERCENT_TO_DOUBLE_OR_DEFAULT(retry_policy.retry_budget(), percent_budget, 20.0);
+      retry_budget_->min_concurrency =
+          PROTOBUF_GET_WRAPPED_OR_DEFAULT(retry_policy.retry_budget(), min_concurrency, 10);
+    }
   }
 }
 

diff --git a/source/common/router/config_impl.h b/source/common/router/config_impl.h
@@ -255,6 +255,7 @@ class RetryPolicyImpl : public RetryPolicy {
   }
   absl::optional<std::chrono::milliseconds> baseInterval() const override { return base_interval_; }
   absl::optional<std::chrono::milliseconds> maxInterval() const override { return max_interval_; }
+  absl::optional<RetryBudget> retryBudget() const override { return retry_budget_; }
 
 private:
   std::chrono::milliseconds per_try_timeout_{0};
@@ -274,6 +275,7 @@ class RetryPolicyImpl : public RetryPolicy {
   absl::optional<std::chrono::milliseconds> base_interval_;
   absl::optional<std::chrono::milliseconds> max_interval_;
   ProtobufMessage::ValidationVisitor* validation_visitor_{};
+  absl::optional<RetryBudget> retry_budget_;
 };
 
 /**

diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc
@@ -59,7 +59,8 @@ RetryStateImpl::RetryStateImpl(const RetryPolicy& route_policy, Http::HeaderMap&
       retry_priority_(route_policy.retryPriority()),
       retriable_status_codes_(route_policy.retriableStatusCodes()),
       retriable_headers_(route_policy.retriableHeaders()),
-      retriable_request_headers_(route_policy.retriableRequestHeaders()) {
+      retriable_request_headers_(route_policy.retriableRequestHeaders()),
+      retry_budget_(route_policy.retryBudget()) {
 
   retry_on_ = route_policy.retryOn();
   retries_remaining_ = std::max(retries_remaining_, route_policy.numRetries());
@@ -232,6 +233,11 @@ RetryStatus RetryStateImpl::shouldRetry(bool would_retry, DoRetryCallback callba
     return RetryStatus::No;
   }
 
+  if (retryBudgetExceeded()) {
+    cluster_.stats().upstream_rq_retry_budget_exceeded_.inc();
+    return RetryStatus::NoRetryLimitExceeded;
+  }
+
   ASSERT(!callback_);
   callback_ = callback;
   cluster_.resourceManager(priority_).retries().inc();
@@ -240,6 +246,25 @@ RetryStatus RetryStateImpl::shouldRetry(bool would_retry, DoRetryCallback callba
   return RetryStatus::Yes;
 }
 
+bool RetryStateImpl::retryBudgetExceeded() {
+  // No budget configured: nothing to enforce.
+  if (!retry_budget_.has_value()) {
+    return false;
+  }
+
+  // Active load for this priority: connection + request + pending request resource counts.
+  const uint64_t active_total = cluster_.resourceManager(priority_).connections().count() +
+                                cluster_.resourceManager(priority_).requests().count() +
+                                cluster_.resourceManager(priority_).pendingRequests().count();
+  // The budget only applies once the active load reaches min_concurrency; from there on it is
+  // exceeded when the retry resource count reaches budget_pct percent of the active load.
+  if (active_total < retry_budget_->min_concurrency) {
+    return false;
+  }
+  const double allowed_retries = (retry_budget_->budget_pct / 100.0) * active_total;
+  return cluster_.resourceManager(priority_).retries().count() >= allowed_retries;
+}
+
 RetryStatus RetryStateImpl::shouldRetryHeaders(const Http::HeaderMap& response_headers,
                                                DoRetryCallback callback) {
   return shouldRetry(wouldRetryFromHeaders(response_headers), callback);

diff --git a/source/common/router/retry_state_impl.h b/source/common/router/retry_state_impl.h
@@ -93,6 +93,7 @@ class RetryStateImpl : public RetryState {
   void enableBackoffTimer();
   void resetRetry();
   bool wouldRetryFromReset(const Http::StreamResetReason reset_reason);
+  bool retryBudgetExceeded();
   RetryStatus shouldRetry(bool would_retry, DoRetryCallback callback);
 
   const Upstream::ClusterInfo& cluster_;
@@ -111,6 +112,7 @@ class RetryStateImpl : public RetryState {
   std::vector<uint32_t> retriable_status_codes_;
   std::vector<Http::HeaderMatcherSharedPtr> retriable_headers_;
   std::vector<Http::HeaderMatcherSharedPtr> retriable_request_headers_;
+  absl::optional<RetryPolicy::RetryBudget> retry_budget_;
 };
 
 } // namespace Router

diff --git a/source/common/upstream/resource_manager_impl.h b/source/common/upstream/resource_manager_impl.h
@@ -72,6 +72,7 @@ class ResourceManagerImpl : public ResourceManager {
       open_gauge_.set(canCreate() ? 0 : 1);
     }
     uint64_t max() override { return runtime_.snapshot().getInteger(runtime_key_, max_); }
+    uint64_t count() override { return current_.load(); }
 
     /**
      * We set the gauge instead of incrementing and decrementing because,