diff --git a/api/envoy/config/cluster/v3/cluster.proto b/api/envoy/config/cluster/v3/cluster.proto index 495d6ce39788..bcedfa509818 100644 --- a/api/envoy/config/cluster/v3/cluster.proto +++ b/api/envoy/config/cluster/v3/cluster.proto @@ -43,7 +43,7 @@ message ClusterCollection { } // Configuration for a single upstream cluster. -// [#next-free-field: 56] +// [#next-free-field: 57] message Cluster { option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.Cluster"; @@ -345,6 +345,35 @@ message Cluster { bool list_as_any = 7; } + // Configuration for :ref:`slow start mode `. + message SlowStartConfig { + // Represents the size of slow start window. + // If set, the newly created host remains in slow start mode starting from its creation time + // for the duration of slow start window. + google.protobuf.Duration slow_start_window = 1; + + // This parameter controls the speed of traffic increase over the slow start window. Defaults to 1.0, + // so that endpoint would get linearly increasing amount of traffic. + // When increasing the value for this parameter, the speed of traffic ramp-up increases non-linearly. + // The value of aggression parameter should be greater than 0.0. + // By tuning the parameter, it is possible to achieve polynomial or exponential shape of ramp-up curve. + // + // During slow start window, effective weight of an endpoint would be scaled with time factor and aggression: + // `new_weight = weight * time_factor ^ (1 / aggression)`, + // where `time_factor=(time_since_start_seconds / slow_start_time_seconds)`. + // + // As time progresses, more and more traffic would be sent to endpoint, which is in slow start window. + // Once host exits slow start, time_factor and aggression no longer affect its weight. + core.v3.RuntimeDouble aggression = 2; + } + + // Specific configuration for the RoundRobin load balancing policy. + message RoundRobinLbConfig { + // Configuration for slow start mode. 
+ // If this configuration is not set, slow start will not be enabled. + SlowStartConfig slow_start_config = 1; + } + // Specific configuration for the LeastRequest load balancing policy. message LeastRequestLbConfig { option (udpa.annotations.versioning).previous_message_type = @@ -378,6 +407,10 @@ message Cluster { // .. note:: // This setting only takes effect if all host weights are not equal. core.v3.RuntimeDouble active_request_bias = 2; + + // Configuration for slow start mode. + // If this configuration is not set, slow start will not be enabled. + SlowStartConfig slow_start_config = 3; } // Specific configuration for the :ref:`RingHash` @@ -959,6 +992,9 @@ message Cluster { // Optional configuration for the LeastRequest load balancing policy. LeastRequestLbConfig least_request_lb_config = 37; + + // Optional configuration for the RoundRobin load balancing policy. + RoundRobinLbConfig round_robin_lb_config = 56; } // Common configuration for all load balancer implementations. 
diff --git a/docs/root/_static/slow_start_aggression.svg b/docs/root/_static/slow_start_aggression.svg new file mode 100644 index 000000000000..aac119a0b335 --- /dev/null +++ b/docs/root/_static/slow_start_aggression.svg @@ -0,0 +1,2049 @@ + + + + + + + + 2021-04-26T00:13:24.988771 + image/svg+xml + + + Matplotlib v3.4.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/root/_static/slow_start_example.svg b/docs/root/_static/slow_start_example.svg 
new file mode 100644 index 000000000000..9bd88ce1401f --- /dev/null +++ b/docs/root/_static/slow_start_example.svg @@ -0,0 +1,1053 @@ + + + + + + + + 2021-09-10T13:39:07.873353 + image/svg+xml + + + Matplotlib v3.4.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst index 36e0fddd3ca8..de648a4b8c73 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst @@ -15,3 +15,4 @@ Load Balancing original_dst zone_aware subsets + slow_start diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/slow_start.rst b/docs/root/intro/arch_overview/upstream/load_balancing/slow_start.rst new file mode 100644 index 000000000000..e510f6698255 --- /dev/null +++ b/docs/root/intro/arch_overview/upstream/load_balancing/slow_start.rst @@ -0,0 +1,60 @@ +.. 
_arch_overview_load_balancing_slow_start: + +Slow start mode +=============== + +Slow start mode is a configuration setting in Envoy to progressively increase amount of traffic for newly added upstream endpoints. +With no slow start enabled Envoy would send a proportional amount of traffic to new upstream endpoints. +This could be undesirable for services that require warm up time to serve full production load and could result in request timeouts, loss of data and deteriorated user experience. + +Slow start mode is a mechanism that affects load balancing weight of upstream endpoints and can be configured per upstream cluster. +Currently, slow start is supported in :ref:`Round Robin ` and :ref:`Least Request ` load balancer types. + +Users can specify a :ref:`slow start window parameter` (in seconds), so that if endpoint "cluster membership duration" (amount of time since it has joined the cluster) is within the configured window, it enters slow start mode. +During slow start window, load balancing weight of a particular endpoint will be scaled with time factor, e.g.: + +.. math:: + + NewWeight = Weight*{TimeFactor}^\frac{1}{Aggression} + +where, + +.. math:: + + TimeFactor = \frac{max(TimeSinceStartInSeconds,1)}{SlowStartWindowInSeconds} + +As time progresses, more and more traffic would be sent to endpoint within slow start window. + +:ref:`Aggression parameter` non-linearly affects endpoint weight and represents the speed of ramp-up. +By tuning aggression parameter, one could achieve polynomial or exponential speed for traffic increase. +The simulation below demonstrates how various values for aggression affect traffic ramp-up: + +.. image:: /_static/slow_start_aggression.svg + :width: 60% + :align: center + +Whenever a slow start window duration elapses, upstream endpoint exits slow start mode and gets regular amount of traffic according to load balancing algorithm. +Its load balancing weight will no longer be scaled with time factor and aggression. 
Endpoint could also exit slow start mode in case it leaves the cluster. + +To reiterate, endpoint enters slow start mode: + * If no active healthcheck is configured per cluster, immediately if its cluster membership duration is within slow start window. + * In case an active healthcheck is configured per cluster, when its cluster membership duration is within slow start window and endpoint has passed an active healthcheck. + If endpoint does not pass an active healthcheck during entire slow start window (since it has been added to upstream cluster), then it never enters slow start mode. + +Endpoint exits slow start mode when: + * It leaves the cluster. + * Its cluster membership duration is greater than slow start window. + * It does not pass an active healthcheck configured per cluster. + Endpoint could further re-enter slow start, if it passes an active healthcheck and its creation time is within slow start window. + +It is not recommended to enable slow start mode in low traffic or high number of endpoints scenarios; potential drawbacks would be: + * Endpoint starvation, where endpoint has low probability to receive a request either due to low traffic or high number of total endpoints. + * Spurious (non-gradual) increase of traffic per endpoint, whenever a starving endpoint receives a request and sufficient time has passed within slow start window, + its load balancing weight will increase non-linearly due to time factor. + +Below is an example of what the resulting load balancing weight would look like for endpoints in same priority with Round Robin Loadbalancer type, slow start window of 60 seconds, no active healthcheck and 1.0 aggression. +Once endpoints E1 and E2 exit slow start mode, their load balancing weight remains constant: + +.. 
image:: /_static/slow_start_example.svg + :width: 60% + :align: center diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index 48df3345b5b6..11cbcc649c03 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -136,6 +136,10 @@ New Features * thrift_proxy: added support for :ref:`mirroring requests `. * udp: allows updating filter chain in-place through LDS, which is supported by Quic listener. Such listener config will be rejected in other connection-less UDP listener implementations. It can be reverted by ``envoy.reloadable_features.udp_listener_updates_filter_chain_in_place``. * udp: disallow L4 filter chain in config which configures connection-less UDP listener. It can be reverted by ``envoy.reloadable_features.udp_listener_updates_filter_chain_in_place``. +* upstream: added support for :ref:`slow start mode `, which allows progressively increasing traffic for new endpoints. +* upstream: extended :ref:`Round Robin load balancer configuration ` with :ref:`slow start ` support. +* upstream: extended :ref:`Least Request load balancer configuration ` with :ref:`slow start ` support. + Deprecated ---------- diff --git a/envoy/upstream/upstream.h b/envoy/upstream/upstream.h index caebef0f4e88..732e48ef5f46 100644 --- a/envoy/upstream/upstream.h +++ b/envoy/upstream/upstream.h @@ -835,6 +835,12 @@ class ClusterInfo { virtual const absl::optional& clusterType() const PURE; + /** + * @return configuration for round robin load balancing, only used if LB type is round robin. + */ + virtual const absl::optional& + lbRoundRobinConfig() const PURE; + /** * @return configuration for least request load balancing, only used if LB type is least request. 
*/ diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index f7a11ba8b27c..f9d91c46e906 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -1339,14 +1339,16 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::ClusterEntry( cluster->lbType(), priority_set_, parent_.local_priority_set_, cluster->stats(), cluster->statsScope(), parent.parent_.runtime_, parent.parent_.random_, cluster->lbSubsetInfo(), cluster->lbRingHashConfig(), cluster->lbMaglevConfig(), - cluster->lbLeastRequestConfig(), cluster->lbConfig()); + cluster->lbRoundRobinConfig(), cluster->lbLeastRequestConfig(), cluster->lbConfig(), + parent_.thread_local_dispatcher_.timeSource()); } else { switch (cluster->lbType()) { case LoadBalancerType::LeastRequest: { ASSERT(lb_factory_ == nullptr); lb_ = std::make_unique( priority_set_, parent_.local_priority_set_, cluster->stats(), parent.parent_.runtime_, - parent.parent_.random_, cluster->lbConfig(), cluster->lbLeastRequestConfig()); + parent.parent_.random_, cluster->lbConfig(), cluster->lbLeastRequestConfig(), + parent.thread_local_dispatcher_.timeSource()); break; } case LoadBalancerType::Random: { @@ -1358,9 +1360,10 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::ClusterEntry( } case LoadBalancerType::RoundRobin: { ASSERT(lb_factory_ == nullptr); - lb_ = std::make_unique(priority_set_, parent_.local_priority_set_, - cluster->stats(), parent.parent_.runtime_, - parent.parent_.random_, cluster->lbConfig()); + lb_ = std::make_unique( + priority_set_, parent_.local_priority_set_, cluster->stats(), parent.parent_.runtime_, + parent.parent_.random_, cluster->lbConfig(), cluster->lbRoundRobinConfig(), + parent.thread_local_dispatcher_.timeSource()); break; } case LoadBalancerType::ClusterProvided: diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 
52619e57f039..5cd361b1da58 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -11,6 +11,7 @@ #include "envoy/upstream/upstream.h" #include "source/common/common/assert.h" +#include "source/common/common/logger.h" #include "source/common/protobuf/utility.h" #include "absl/container/fixed_array.h" @@ -754,10 +755,21 @@ const HostVector& ZoneAwareLoadBalancerBase::hostSourceToHosts(HostsSource hosts EdfLoadBalancerBase::EdfLoadBalancerBase( const PrioritySet& priority_set, const PrioritySet* local_priority_set, ClusterStats& stats, Runtime::Loader& runtime, Random::RandomGenerator& random, - const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config) + const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, + const absl::optional slow_start_config, + TimeSource& time_source) : ZoneAwareLoadBalancerBase(priority_set, local_priority_set, stats, runtime, random, common_config), - seed_(random_.random()) { + seed_(random_.random()), + slow_start_window_(slow_start_config.has_value() + ? std::chrono::milliseconds(DurationUtil::durationToMilliseconds( + slow_start_config.value().slow_start_window())) + : std::chrono::milliseconds(0)), + aggression_runtime_( + slow_start_config.has_value() && slow_start_config.value().has_aggression() + ? absl::optional({slow_start_config.value().aggression(), runtime}) + : absl::nullopt), + time_source_(time_source), latest_host_added_time_(time_source_.monotonicTime()) { // We fully recompute the schedulers for a given host set here on membership change, which is // consistent with what other LB implementations do (e.g. thread aware). // The downside of a full recompute is that time complexity is O(n * log n), @@ -765,6 +777,12 @@ EdfLoadBalancerBase::EdfLoadBalancerBase( // https://github.com/envoyproxy/envoy/issues/2874). 
priority_update_cb_ = priority_set.addPriorityUpdateCb( [this](uint32_t priority, const HostVector&, const HostVector&) { refresh(priority); }); + member_update_cb_ = priority_set.addMemberUpdateCb( + [this](const HostVector& hosts_added, const HostVector&) -> void { + if (isSlowStartEnabled()) { + recalculateHostsInSlowStart(hosts_added); + } + }); } void EdfLoadBalancerBase::initialize() { @@ -773,20 +791,38 @@ void EdfLoadBalancerBase::initialize() { } } +void EdfLoadBalancerBase::recalculateHostsInSlowStart(const HostVector& hosts) { + auto current_time = time_source_.monotonicTime(); + // TODO(nezdolik): linear scan can be improved with using flat hash set for hosts in slow start. + for (const auto& host : hosts) { + auto host_create_duration = + std::chrono::duration_cast(current_time - host->creationTime()); + // Check if host existence time is within slow start window. + if (host->creationTime() > latest_host_added_time_ && + host_create_duration <= slow_start_window_ && + host->health() == Upstream::Host::Health::Healthy) { + latest_host_added_time_ = host->creationTime(); + } + } +} + void EdfLoadBalancerBase::refresh(uint32_t priority) { const auto add_hosts_source = [this](HostsSource source, const HostVector& hosts) { // Nuke existing scheduler if it exists. auto& scheduler = scheduler_[source] = Scheduler{}; refreshHostSource(source); + if (isSlowStartEnabled()) { + recalculateHostsInSlowStart(hosts); + } - // Check if the original host weights are equal and skip EDF creation if they are. When all - // original weights are equal we can rely on unweighted host pick to do optimal round robin and - // least-loaded host selection with lower memory and CPU overhead. - if (hostWeightsAreEqual(hosts)) { + // Check if the original host weights are equal and no hosts are in slow start mode, in that + // case EDF creation is skipped. 
When all original weights are equal and no hosts are in slow + // start mode we can rely on unweighted host pick to do optimal round robin and least-loaded + // host selection with lower memory and CPU overhead. + if (hostWeightsAreEqual(hosts) && noHostsAreInSlowStart()) { // Skip edf creation. return; } - scheduler.edf_ = std::make_unique>(); // Populate scheduler with host list. @@ -812,7 +848,6 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) { } } }; - // Populate EdfSchedulers for each valid HostsSource value for the host set at this priority. const auto& host_set = priority_set_.hostSetsPerPriority()[priority]; add_hosts_source(HostsSource(priority, HostsSource::SourceType::AllHosts), host_set->hosts()); @@ -834,6 +869,22 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) { } } +bool EdfLoadBalancerBase::isSlowStartEnabled() { + return slow_start_window_ > std::chrono::milliseconds(0); +} + +bool EdfLoadBalancerBase::noHostsAreInSlowStart() { + if (!isSlowStartEnabled()) { + return true; + } + auto current_time = time_source_.monotonicTime(); + if (std::chrono::duration_cast( + current_time - latest_host_added_time_) <= slow_start_window_) { + return false; + } + return true; +} + HostConstSharedPtr EdfLoadBalancerBase::peekAnotherHost(LoadBalancerContext* context) { if (tooManyPreconnects(stashed_random_.size(), total_healthy_hosts_)) { return nullptr; @@ -892,6 +943,36 @@ HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* cont } } +double EdfLoadBalancerBase::applyAggressionFactor(double time_factor) { + if (aggression_ == 1.0 || time_factor == 1.0) { + return time_factor; + } else { + return std::pow(time_factor, 1.0 / aggression_); + } +} + +double EdfLoadBalancerBase::applySlowStartFactor(double host_weight, const Host& host) { + auto host_create_duration = std::chrono::duration_cast( + time_source_.monotonicTime() - host.creationTime()); + if (host_create_duration < slow_start_window_ && + host.health() == 
Upstream::Host::Health::Healthy) { + aggression_ = aggression_runtime_ != absl::nullopt ? aggression_runtime_.value().value() : 1.0; + if (aggression_ < 0.0) { + ENVOY_LOG_EVERY_POW_2(error, "Invalid runtime value provided for aggression parameter, " + "agression cannot be less than 0.0"); + } + aggression_ = std::max(0.0, aggression_); + + ASSERT(aggression_ > 0.0); + auto time_factor = static_cast(std::max(std::chrono::milliseconds(1).count(), + host_create_duration.count())) / + slow_start_window_.count(); + return host_weight * applyAggressionFactor(time_factor); + } else { + return host_weight; + } +} + HostConstSharedPtr LeastRequestLoadBalancer::unweightedHostPeek(const HostVector&, const HostsSource&) { // LeastRequestLoadBalancer can not do deterministic preconnecting, because @@ -903,11 +984,13 @@ HostConstSharedPtr LeastRequestLoadBalancer::unweightedHostPeek(const HostVector HostConstSharedPtr LeastRequestLoadBalancer::unweightedHostPick(const HostVector& hosts_to_use, const HostsSource&) { HostSharedPtr candidate_host = nullptr; + for (uint32_t choice_idx = 0; choice_idx < choice_count_; ++choice_idx) { const int rand_idx = random_.random() % hosts_to_use.size(); HostSharedPtr sampled_host = hosts_to_use[rand_idx]; if (candidate_host == nullptr) { + // Make a first choice to start the comparisons. candidate_host = sampled_host; continue; diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index f38e3f576516..6cd0c3710920 100644 --- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -387,12 +387,15 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { * This base class also supports unweighted selection which derived classes can use to customize * behavior. Derived classes can also override how host weight is determined when in weighted mode. 
*/ -class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase { +class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase, + Logger::Loggable { public: - EdfLoadBalancerBase(const PrioritySet& priority_set, const PrioritySet* local_priority_set, - ClusterStats& stats, Runtime::Loader& runtime, - Random::RandomGenerator& random, - const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config); + EdfLoadBalancerBase( + const PrioritySet& priority_set, const PrioritySet* local_priority_set, ClusterStats& stats, + Runtime::Loader& runtime, Random::RandomGenerator& random, + const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, + const absl::optional slow_start_cofig, + TimeSource& time_source); // Upstream::ZoneAwareLoadBalancerBase HostConstSharedPtr peekAnotherHost(LoadBalancerContext* context) override; @@ -410,6 +413,11 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase { virtual void refresh(uint32_t priority); + bool isSlowStartEnabled(); + bool noHostsAreInSlowStart(); + + virtual void recalculateHostsInSlowStart(const HostVector& hosts_added); + // Seed to allow us to desynchronize load balancers across a fleet. If we don't // do this, multiple Envoys that receive an update at the same time (or even // multiple load balancers on the same host) will send requests to @@ -417,7 +425,11 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase { // overload. const uint64_t seed_; + double applyAggressionFactor(double time_factor); + double applySlowStartFactor(double host_weight, const Host& host); + private: + friend class EdfLoadBalancerBasePeer; virtual void refreshHostSource(const HostsSource& source) PURE; virtual double hostWeight(const Host& host) PURE; virtual HostConstSharedPtr unweightedHostPeek(const HostVector& hosts_to_use, @@ -428,6 +440,15 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase { // Scheduler for each valid HostsSource. 
absl::node_hash_map scheduler_; Common::CallbackHandlePtr priority_update_cb_; + Common::CallbackHandlePtr member_update_cb_; + +protected: + // Slow start related config + const std::chrono::milliseconds slow_start_window_; + double aggression_{1.0}; + const absl::optional aggression_runtime_; + TimeSource& time_source_; + MonotonicTime latest_host_added_time_; }; /** @@ -436,12 +457,20 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase { */ class RoundRobinLoadBalancer : public EdfLoadBalancerBase { public: - RoundRobinLoadBalancer(const PrioritySet& priority_set, const PrioritySet* local_priority_set, - ClusterStats& stats, Runtime::Loader& runtime, - Random::RandomGenerator& random, - const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config) - : EdfLoadBalancerBase(priority_set, local_priority_set, stats, runtime, random, - common_config) { + RoundRobinLoadBalancer( + const PrioritySet& priority_set, const PrioritySet* local_priority_set, ClusterStats& stats, + Runtime::Loader& runtime, Random::RandomGenerator& random, + const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, + const absl::optional + round_robin_config, + TimeSource& time_source) + : EdfLoadBalancerBase( + priority_set, local_priority_set, stats, runtime, random, common_config, + (round_robin_config.has_value() && round_robin_config.value().has_slow_start_config()) + ? absl::optional( + round_robin_config.value().slow_start_config()) + : absl::nullopt, + time_source) { initialize(); } @@ -455,7 +484,13 @@ class RoundRobinLoadBalancer : public EdfLoadBalancerBase { // index. 
peekahead_index_ = 0; } - double hostWeight(const Host& host) override { return host.weight(); } + double hostWeight(const Host& host) override { + if (!noHostsAreInSlowStart()) { + return applySlowStartFactor(host.weight(), host); + } + return host.weight(); + } + HostConstSharedPtr unweightedHostPeek(const HostVector& hosts_to_use, const HostsSource& source) override { auto i = rr_indexes_.find(source); @@ -498,37 +533,45 @@ class RoundRobinLoadBalancer : public EdfLoadBalancerBase { * The benefit of the Maglev table is at the expense of resolution, memory usage is capped. * Additionally, the Maglev table can be shared amongst all threads. */ -class LeastRequestLoadBalancer : public EdfLoadBalancerBase, - Logger::Loggable { +class LeastRequestLoadBalancer : public EdfLoadBalancerBase { public: LeastRequestLoadBalancer( const PrioritySet& priority_set, const PrioritySet* local_priority_set, ClusterStats& stats, Runtime::Loader& runtime, Random::RandomGenerator& random, const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, const absl::optional - least_request_config) - : EdfLoadBalancerBase(priority_set, local_priority_set, stats, runtime, random, - common_config), + least_request_config, + TimeSource& time_source) + : EdfLoadBalancerBase( + priority_set, local_priority_set, stats, runtime, random, common_config, + (least_request_config.has_value() && + least_request_config.value().has_slow_start_config()) + ? absl::optional( + least_request_config.value().slow_start_config()) + : absl::nullopt, + time_source), choice_count_( least_request_config.has_value() ? PROTOBUF_GET_WRAPPED_OR_DEFAULT(least_request_config.value(), choice_count, 2) : 2), active_request_bias_runtime_( least_request_config.has_value() && least_request_config->has_active_request_bias() - ? std::make_unique(least_request_config->active_request_bias(), - runtime) - : nullptr) { + ? 
absl::optional( + {least_request_config->active_request_bias(), runtime}) + : absl::nullopt) { initialize(); } protected: void refresh(uint32_t priority) override { - active_request_bias_ = - active_request_bias_runtime_ != nullptr ? active_request_bias_runtime_->value() : 1.0; + active_request_bias_ = active_request_bias_runtime_ != absl::nullopt + ? active_request_bias_runtime_.value().value() + : 1.0; if (active_request_bias_ < 0.0) { - ENVOY_LOG(warn, "upstream: invalid active request bias supplied (runtime key {}), using 1.0", - active_request_bias_runtime_->runtimeKey()); + ENVOY_LOG_MISC(warn, + "upstream: invalid active request bias supplied (runtime key {}), using 1.0", + active_request_bias_runtime_->runtimeKey()); active_request_bias_ = 1.0; } @@ -555,16 +598,21 @@ class LeastRequestLoadBalancer : public EdfLoadBalancerBase, // // It might be possible to do better by picking two hosts off of the schedule, and selecting the // one with fewer active requests at the time of selection. - if (active_request_bias_ == 0.0) { - return host.weight(); - } + + double host_weight = static_cast(host.weight()); if (active_request_bias_ == 1.0) { - return static_cast(host.weight()) / (host.stats().rq_active_.value() + 1); + host_weight = static_cast(host.weight()) / (host.stats().rq_active_.value() + 1); + } else if (active_request_bias_ != 0.0) { + host_weight = static_cast(host.weight()) / + std::pow(host.stats().rq_active_.value() + 1, active_request_bias_); } - return static_cast(host.weight()) / - std::pow(host.stats().rq_active_.value() + 1, active_request_bias_); + if (!noHostsAreInSlowStart()) { + return applySlowStartFactor(host_weight, host); + } else { + return host_weight; + } } HostConstSharedPtr unweightedHostPeek(const HostVector& hosts_to_use, const HostsSource& source) override; @@ -578,13 +626,14 @@ class LeastRequestLoadBalancer : public EdfLoadBalancerBase, // whenever a `HostSet` is updated. 
double active_request_bias_{}; - const std::unique_ptr active_request_bias_runtime_; + const absl::optional active_request_bias_runtime_; }; /** * Random load balancer that picks a random host out of all hosts. */ -class RandomLoadBalancer : public ZoneAwareLoadBalancerBase { +class RandomLoadBalancer : public ZoneAwareLoadBalancerBase, + Logger::Loggable { public: RandomLoadBalancer(const PrioritySet& priority_set, const PrioritySet* local_priority_set, ClusterStats& stats, Runtime::Loader& runtime, Random::RandomGenerator& random, diff --git a/source/common/upstream/subset_lb.cc b/source/common/upstream/subset_lb.cc index 4c5a420a9450..b5bf551cc667 100644 --- a/source/common/upstream/subset_lb.cc +++ b/source/common/upstream/subset_lb.cc @@ -26,19 +26,23 @@ SubsetLoadBalancer::SubsetLoadBalancer( const absl::optional& lb_ring_hash_config, const absl::optional& lb_maglev_config, + const absl::optional& + round_robin_config, const absl::optional& least_request_config, - const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config) + const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, + TimeSource& time_source) : lb_type_(lb_type), lb_ring_hash_config_(lb_ring_hash_config), - lb_maglev_config_(lb_maglev_config), least_request_config_(least_request_config), - common_config_(common_config), stats_(stats), scope_(scope), runtime_(runtime), - random_(random), fallback_policy_(subsets.fallbackPolicy()), + lb_maglev_config_(lb_maglev_config), round_robin_config_(round_robin_config), + least_request_config_(least_request_config), common_config_(common_config), stats_(stats), + scope_(scope), runtime_(runtime), random_(random), fallback_policy_(subsets.fallbackPolicy()), default_subset_metadata_(subsets.defaultSubset().fields().begin(), subsets.defaultSubset().fields().end()), subset_selectors_(subsets.subsetSelectors()), original_priority_set_(priority_set), original_local_priority_set_(local_priority_set), 
locality_weight_aware_(subsets.localityWeightAware()), - scale_locality_weight_(subsets.scaleLocalityWeight()), list_as_any_(subsets.listAsAny()) { + scale_locality_weight_(subsets.scaleLocalityWeight()), list_as_any_(subsets.listAsAny()), + time_source_(time_source) { ASSERT(subsets.isEnabled()); if (fallback_policy_ != envoy::config::cluster::v3::Cluster::LbSubsetConfig::NO_FALLBACK) { @@ -751,7 +755,8 @@ SubsetLoadBalancer::PrioritySubsetImpl::PrioritySubsetImpl(const SubsetLoadBalan case LoadBalancerType::LeastRequest: lb_ = std::make_unique( *this, subset_lb.original_local_priority_set_, subset_lb.stats_, subset_lb.runtime_, - subset_lb.random_, subset_lb.common_config_, subset_lb.least_request_config_); + subset_lb.random_, subset_lb.common_config_, subset_lb.least_request_config_, + subset_lb.time_source_); break; case LoadBalancerType::Random: @@ -761,9 +766,10 @@ SubsetLoadBalancer::PrioritySubsetImpl::PrioritySubsetImpl(const SubsetLoadBalan break; case LoadBalancerType::RoundRobin: - lb_ = std::make_unique(*this, subset_lb.original_local_priority_set_, - subset_lb.stats_, subset_lb.runtime_, - subset_lb.random_, subset_lb.common_config_); + lb_ = std::make_unique( + *this, subset_lb.original_local_priority_set_, subset_lb.stats_, subset_lb.runtime_, + subset_lb.random_, subset_lb.common_config_, subset_lb.round_robin_config_, + subset_lb.time_source_); break; case LoadBalancerType::RingHash: diff --git a/source/common/upstream/subset_lb.h b/source/common/upstream/subset_lb.h index 354341ff060c..1be1830aa587 100644 --- a/source/common/upstream/subset_lb.h +++ b/source/common/upstream/subset_lb.h @@ -30,9 +30,12 @@ class SubsetLoadBalancer : public LoadBalancer, Logger::Loggable& lb_ring_hash_config, const absl::optional& lb_maglev_config, + const absl::optional& + round_robin_config, const absl::optional& least_request_config, - const envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config); + const 
envoy::config::cluster::v3::Cluster::CommonLbConfig& common_config, + TimeSource& time_source); ~SubsetLoadBalancer() override; // Upstream::LoadBalancer @@ -239,6 +242,7 @@ class SubsetLoadBalancer : public LoadBalancer, Logger::Loggable lb_ring_hash_config_; const absl::optional lb_maglev_config_; + const absl::optional round_robin_config_; const absl::optional least_request_config_; const envoy::config::cluster::v3::Cluster::CommonLbConfig common_config_; @@ -280,6 +284,8 @@ class SubsetLoadBalancer : public LoadBalancer, Logger::Loggable& + lbRoundRobinConfig() const override { + return lb_round_robin_config_; + } const absl::optional& lbLeastRequestConfig() const override { return lb_least_request_config_; @@ -779,6 +783,7 @@ class ClusterInfoImpl : public ClusterInfo, const std::string maintenance_mode_runtime_key_; const Network::Address::InstanceConstSharedPtr source_address_; LoadBalancerType lb_type_; + absl::optional lb_round_robin_config_; absl::optional lb_least_request_config_; absl::optional lb_ring_hash_config_; diff --git a/test/common/upstream/BUILD b/test/common/upstream/BUILD index 684e16be97e7..8cef07880497 100644 --- a/test/common/upstream/BUILD +++ b/test/common/upstream/BUILD @@ -827,6 +827,7 @@ envoy_proto_library( srcs = ["round_robin_load_balancer_fuzz.proto"], deps = [ "//test/common/upstream:zone_aware_load_balancer_fuzz_proto", + "@envoy_api//envoy/config/cluster/v3:pkg", ], ) diff --git a/test/common/upstream/least_request_load_balancer_fuzz_test.cc b/test/common/upstream/least_request_load_balancer_fuzz_test.cc index 85b0689f4d1e..2bc4958d44e4 100644 --- a/test/common/upstream/least_request_load_balancer_fuzz_test.cc +++ b/test/common/upstream/least_request_load_balancer_fuzz_test.cc @@ -65,7 +65,7 @@ DEFINE_PROTO_FUZZER(const test::common::upstream::LeastRequestLoadBalancerTestCa zone_aware_load_balancer_fuzz.stats_, zone_aware_load_balancer_fuzz.runtime_, zone_aware_load_balancer_fuzz.random_, 
zone_aware_load_balancer_test_case.load_balancer_test_case().common_lb_config(), - input.least_request_lb_config()); + input.least_request_lb_config(), zone_aware_load_balancer_fuzz.simTime()); } catch (EnvoyException& e) { ENVOY_LOG_MISC(debug, "EnvoyException; {}", e.what()); removeRequestsActiveForStaticHosts(zone_aware_load_balancer_fuzz.priority_set_); diff --git a/test/common/upstream/load_balancer_benchmark.cc b/test/common/upstream/load_balancer_benchmark.cc index 22f58f0f9337..6a855bae9aac 100644 --- a/test/common/upstream/load_balancer_benchmark.cc +++ b/test/common/upstream/load_balancer_benchmark.cc @@ -71,6 +71,7 @@ class BaseTester : public Event::TestUsingSimulatedTime { NiceMock runtime_; Random::RandomGeneratorImpl random_; envoy::config::cluster::v3::Cluster::CommonLbConfig common_config_; + envoy::config::cluster::v3::Cluster::RoundRobinLbConfig round_robin_lb_config_; std::shared_ptr info_{new NiceMock()}; }; @@ -81,7 +82,8 @@ class RoundRobinTester : public BaseTester { void initialize() { lb_ = std::make_unique(priority_set_, &local_priority_set_, stats_, - runtime_, random_, common_config_); + runtime_, random_, common_config_, + round_robin_lb_config_, simTime()); } std::unique_ptr lb_; @@ -92,9 +94,9 @@ class LeastRequestTester : public BaseTester { LeastRequestTester(uint64_t num_hosts, uint32_t choice_count) : BaseTester(num_hosts) { envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; lr_lb_config.mutable_choice_count()->set_value(choice_count); - lb_ = - std::make_unique(priority_set_, &local_priority_set_, stats_, - runtime_, random_, common_config_, lr_lb_config); + lb_ = std::make_unique(priority_set_, &local_priority_set_, stats_, + runtime_, random_, common_config_, + lr_lb_config, simTime()); } std::unique_ptr lb_; @@ -541,10 +543,10 @@ class SubsetLbTester : public BaseTester { *selector->mutable_keys()->Add() = std::string(metadata_key); subset_info_ = std::make_unique(subset_config); - lb_ = 
std::make_unique(LoadBalancerType::Random, priority_set_, - &local_priority_set_, stats_, stats_store_, runtime_, - random_, *subset_info_, absl::nullopt, absl::nullopt, - absl::nullopt, common_config_); + lb_ = std::make_unique( + LoadBalancerType::Random, priority_set_, &local_priority_set_, stats_, stats_store_, + runtime_, random_, *subset_info_, absl::nullopt, absl::nullopt, absl::nullopt, + absl::nullopt, common_config_, simTime()); const HostVector& hosts = priority_set_.getOrCreateHostSet(0).hosts(); ASSERT(hosts.size() == num_hosts); diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index dea39058ef38..78e6d1f74691 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -32,6 +32,19 @@ using testing::ReturnRef; namespace Envoy { namespace Upstream { + +class EdfLoadBalancerBasePeer { +public: + static const std::chrono::milliseconds& slowStartWindow(EdfLoadBalancerBase& edf_lb) { + return edf_lb.slow_start_window_; + } + static double aggression(EdfLoadBalancerBase& edf_lb) { return edf_lb.aggression_; } + static const std::chrono::milliseconds latestHostAddedTime(EdfLoadBalancerBase& edf_lb) { + return std::chrono::time_point_cast(edf_lb.latest_host_added_time_) + .time_since_epoch(); + } +}; + namespace { static constexpr uint32_t UnhealthyStatus = 1u << static_cast(Host::Health::Unhealthy); @@ -62,6 +75,7 @@ class LoadBalancerTestBase : public Event::TestUsingSimulatedTime, std::shared_ptr info_{new NiceMock()}; envoy::config::cluster::v3::Cluster::CommonLbConfig common_config_; envoy::config::cluster::v3::Cluster::LeastRequestLbConfig least_request_lb_config_; + envoy::config::cluster::v3::Cluster::RoundRobinLbConfig round_robin_lb_config_; }; class TestLb : public LoadBalancerBase { @@ -232,8 +246,8 @@ TEST_P(LoadBalancerBaseTest, PrioritySelectionFuzz) { const auto hs = lb_.chooseHostSet(&context, 0); switch (hs.second) { case 
LoadBalancerBase::HostAvailability::Healthy: - // Either we selected one of the healthy hosts or we failed to select anything and defaulted - // to healthy. + // Either we selected one of the healthy hosts or we failed to select anything and + // defaulted to healthy. EXPECT_TRUE(!hs.first.healthyHosts().empty() || (hs.first.healthyHosts().empty() && hs.first.degradedHosts().empty())); break; @@ -319,7 +333,9 @@ TEST_P(LoadBalancerBaseTest, GentleFailover) { // Health P=0 == 100*1.4 == 35 P=1 == 35 // Since 3 hosts are excluded, P=0 should be considered fully healthy. // Total health = 100% + 35% is greater than 100%. Panic should not trigger. - updateHostSet(host_set_, 4 /* num_hosts */, 1 /* num_healthy_hosts */, 0 /* num_degraded_hosts */, + updateHostSet(host_set_, 4 /* num_hosts */, 1 /* num_healthy_hosts */, 0 /* num_degraded_hosts + */ + , 3 /* num_excluded_hosts */); updateHostSet(failover_host_set_, 5 /* num_hosts */, 1 /* num_healthy_hosts */); ASSERT_THAT(getLoadPercentage(), ElementsAre(100, 0)); @@ -330,7 +346,9 @@ TEST_P(LoadBalancerBaseTest, GentleFailover) { // All priorities are in panic mode (situation called TotalPanic) // Load is distributed based on number of hosts regardless of their health status. // P=0 and P=1 have 4 hosts each so each priority will receive 50% of the traffic. - updateHostSet(host_set_, 4 /* num_hosts */, 0 /* num_healthy_hosts */, 0 /* num_degraded_hosts */, + updateHostSet(host_set_, 4 /* num_hosts */, 0 /* num_healthy_hosts */, 0 /* num_degraded_hosts + */ + , 4 /* num_excluded_hosts */); updateHostSet(failover_host_set_, 4 /* num_hosts */, 1 /* num_healthy_hosts */); ASSERT_THAT(getLoadPercentage(), ElementsAre(50, 50)); @@ -342,7 +360,9 @@ TEST_P(LoadBalancerBaseTest, GentleFailover) { // P=0 has 4 hosts with 1 excluded, P=1 has 6 hosts with 2 excluded. 
// P=0 should receive 4/(4+6)=40% of traffic // P=1 should receive 6/(4+6)=60% of traffic - updateHostSet(host_set_, 4 /* num_hosts */, 0 /* num_healthy_hosts */, 0 /* num_degraded_hosts */, + updateHostSet(host_set_, 4 /* num_hosts */, 0 /* num_healthy_hosts */, 0 /* num_degraded_hosts + */ + , 1 /* num_excluded_hosts */); updateHostSet(failover_host_set_, 6 /* num_hosts */, 1 /* num_healthy_hosts */, 0 /* num_degraded_hosts */, 2 /* num_excluded_hosts */); @@ -646,7 +666,8 @@ class RoundRobinLoadBalancerTest : public LoadBalancerTestBase { local_priority_set_->getOrCreateHostSet(0); } lb_ = std::make_shared(priority_set_, local_priority_set_.get(), stats_, - runtime_, random_, common_config_); + runtime_, random_, common_config_, + round_robin_lb_config_, simTime()); } // Updates priority 0 with the given hosts and hosts_per_locality. @@ -1375,8 +1396,8 @@ TEST_P(RoundRobinLoadBalancerTest, LowPrecisionForDistribution) { // The following host distribution with current precision should lead to the no_capacity_left // situation. - // Reuse the same host in all of the structures below to reduce time test takes and this does not - // impact load balancing logic. + // Reuse the same host in all of the structures below to reduce time test takes and this does + // not impact load balancing logic. 
HostSharedPtr host = makeTestHost(info_, "tcp://127.0.0.1:80", simTime()); HostVector current(45000); @@ -1555,10 +1576,302 @@ TEST_P(RoundRobinLoadBalancerTest, NoZoneAwareRoutingNoLocalLocality) { INSTANTIATE_TEST_SUITE_P(PrimaryOrFailover, RoundRobinLoadBalancerTest, ::testing::Values(true, false)); +TEST_P(RoundRobinLoadBalancerTest, SlowStartWithDefaultParams) { + init(false); + const auto slow_start_window = + EdfLoadBalancerBasePeer::slowStartWindow(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(0), slow_start_window); + const auto aggression = + EdfLoadBalancerBasePeer::aggression(static_cast(*lb_)); + EXPECT_EQ(1.0, aggression); + const auto latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(0), latest_host_added_time); +} + +TEST_P(RoundRobinLoadBalancerTest, SlowStartNoWait) { + round_robin_lb_config_.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(60); + simTime().advanceTimeWait(std::chrono::seconds(1)); + auto host1 = makeTestHost(info_, "tcp://127.0.0.1:80", simTime()); + host_set_.hosts_ = {host1}; + + init(true); + + // As no healthcheck is configured, hosts would enter slow start immediately. + HostVector empty; + HostVector hosts_added; + hosts_added.push_back(host1); + simTime().advanceTimeWait(std::chrono::seconds(5)); + hostSet().runCallbacks(hosts_added, empty); + auto latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(1000), latest_host_added_time_ms); + + // Advance time, so that host is no longer in slow start. 
+ simTime().advanceTimeWait(std::chrono::seconds(56)); + + hosts_added.clear(); + auto host2 = makeTestHost(info_, "tcp://127.0.0.1:90", simTime()); + + hosts_added.push_back(host2); + + hostSet().healthy_hosts_ = {host1, host2}; + hostSet().hosts_ = hostSet().healthy_hosts_; + hostSet().runCallbacks(hosts_added, empty); + + latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(62000), latest_host_added_time_ms); + + // host2 is 12 secs in slow start, the weight is scaled with time factor 12 / 60 == 0.2. + simTime().advanceTimeWait(std::chrono::seconds(12)); + + // Recalculate weights. + hostSet().runCallbacks(empty, empty); + + // We expect 4:1 ratio, as host2 is in slow start mode and its weight is scaled with + // 0.2 factor. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + + // host2 is 20 secs in slow start, the weight is scaled with time factor 20 / 60 == 0.33. + simTime().advanceTimeWait(std::chrono::seconds(8)); + + // Recalculate weights. + hostSet().runCallbacks(empty, empty); + + // We expect 2:1 ratio, as host2 is in slow start mode and its weight is scaled with + // 0.33 factor. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + + // Advance time, so that there are no hosts in slow start. + simTime().advanceTimeWait(std::chrono::seconds(45)); + + // Recalculate weights. + hostSet().runCallbacks(empty, empty); + + // Now expect 1:1 ratio.
+ EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); +} + +TEST_P(RoundRobinLoadBalancerTest, SlowStartWaitForPassingHC) { + round_robin_lb_config_.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(10); + simTime().advanceTimeWait(std::chrono::seconds(1)); + auto host1 = makeTestHost(info_, "tcp://127.0.0.1:80", simTime()); + host1->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); + + host_set_.hosts_ = {host1}; + + init(true); + + HostVector empty; + HostVector hosts_added; + hosts_added.push_back(host1); + simTime().advanceTimeWait(std::chrono::seconds(1)); + hostSet().runCallbacks(hosts_added, empty); + auto latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(1000), latest_host_added_time_ms); + + simTime().advanceTimeWait(std::chrono::seconds(5)); + + hosts_added.clear(); + auto host2 = makeTestHost(info_, "tcp://127.0.0.1:90", simTime()); + hosts_added.push_back(host2); + + hostSet().hosts_ = {host1, host2}; + hostSet().runCallbacks(hosts_added, empty); + + // As host1 has not passed first HC, it should not enter slow start mode. + latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(7000), latest_host_added_time_ms); + + simTime().advanceTimeWait(std::chrono::seconds(1)); + host1->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + hostSet().healthy_hosts_ = {host1, host2}; + // Trigger callbacks to add host1 to slow start mode. + hostSet().runCallbacks({}, {}); + + simTime().advanceTimeWait(std::chrono::seconds(1)); + host1->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); + // Trigger callbacks to remove host1 from slow start mode. 
+ hostSet().runCallbacks({}, {}); + simTime().advanceTimeWait(std::chrono::seconds(4)); + // Trigger callbacks to recalculate weights after the time advance. + hostSet().runCallbacks({}, {}); + + // We expect 3:1 ratio, as host2 is in slow start mode, its weight is scaled with time factor + // 5 / 10 == 0.5. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + + // Advance time, so there are no hosts in slow start. + simTime().advanceTimeWait(std::chrono::seconds(20)); + hostSet().runCallbacks({}, {}); + + // We expect 1:1 ratio, as there are no hosts in slow start mode. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); +} + +TEST_P(RoundRobinLoadBalancerTest, SlowStartWithRuntimeAggression) { + round_robin_lb_config_.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(10); + round_robin_lb_config_.mutable_slow_start_config()->mutable_aggression()->set_runtime_key( + "aggression"); + round_robin_lb_config_.mutable_slow_start_config()->mutable_aggression()->set_default_value(1.0); + + init(true); + EXPECT_CALL(runtime_.snapshot_, getDouble("aggression", 1.0)).WillRepeatedly(Return(1.0)); + + simTime().advanceTimeWait(std::chrono::seconds(1)); + + hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80", simTime(), 1), + makeTestHost(info_, "tcp://127.0.0.1:90", simTime(), 1), + makeTestHost(info_, "tcp://127.0.0.1:100", simTime(), 1)}; + + hostSet().hosts_ = hostSet().healthy_hosts_; + hostSet().runCallbacks({}, {}); + + simTime().advanceTimeWait(std::chrono::seconds(5)); +
hostSet().healthy_hosts_[0]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); + hostSet().runCallbacks({}, {}); + + auto latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(1000), latest_host_added_time_ms); + + // We should see 2:1:1 ratio, as hosts 2 and 3 are in slow start, their weights are scaled with + // 0.5 factor. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[2], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + + simTime().advanceTimeWait(std::chrono::seconds(4)); + HostVector hosts_added; + auto host4 = makeTestHost(info_, "tcp://127.0.0.1:110", simTime()); + hostSet().hosts_.push_back(host4); + hostSet().healthy_hosts_.push_back(host4); + EXPECT_CALL(runtime_.snapshot_, getDouble("aggression", 1.0)).WillRepeatedly(Return(1.5)); + // Recompute edf schedulers. + hostSet().runCallbacks(hosts_added, {}); + + latest_host_added_time_ms = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(*lb_)); + EXPECT_EQ(std::chrono::milliseconds(10000), latest_host_added_time_ms); + + // We should see 1:1:1:0 ratio, as host 2 and 3 weight is scaled with (9/10)^(1/1.5)=0.93 factor, + // host4 weight is 0.002. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[2], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[2], lb_->chooseHost(nullptr)); + + // host4 is 9 seconds in slow start, its weight is scaled with (9/10)^(1/1.5)=0.93 factor.
+ simTime().advanceTimeWait(std::chrono::seconds(9)); + hostSet().runCallbacks({}, {}); + + // We should see 1:1:1:1 ratio, only host4 is in slow start with weight 0.93, and the rest of + // hosts are outside of slow start with weight 1. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[2], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[3], lb_->chooseHost(nullptr)); +} + +TEST_P(RoundRobinLoadBalancerTest, SlowStartNoWaitNonLinearAggression) { + round_robin_lb_config_.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(60); + round_robin_lb_config_.mutable_slow_start_config()->mutable_aggression()->set_runtime_key( + "aggression"); + round_robin_lb_config_.mutable_slow_start_config()->mutable_aggression()->set_default_value(2.0); + simTime().advanceTimeWait(std::chrono::seconds(1)); + + init(true); + + // As no healthcheck is configured, hosts would enter slow start immediately. + hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80", simTime())}; + hostSet().hosts_ = hostSet().healthy_hosts_; + simTime().advanceTimeWait(std::chrono::seconds(5)); + // Host1 is 5 secs in slow start, its weight is scaled with (5/60)^(1/2)=0.28 factor. + hostSet().runCallbacks({}, {}); + + // Advance time, so that host1 is no longer in slow start. + simTime().advanceTimeWait(std::chrono::seconds(56)); + + HostVector hosts_added; + auto host2 = makeTestHost(info_, "tcp://127.0.0.1:90", simTime()); + + hosts_added.push_back(host2); + + hostSet().healthy_hosts_.push_back(host2); + hostSet().hosts_ = hostSet().healthy_hosts_; + // host2 weight is scaled with 0.004 factor. + hostSet().runCallbacks(hosts_added, {}); + + // host2 is 6 secs in slow start. + simTime().advanceTimeWait(std::chrono::seconds(6)); + + // Recalculate weights.
+ hostSet().runCallbacks({}, {}); + + // We expect 3:1 ratio, as host2 is 6 secs in slow start mode and its weight is scaled with + // pow(0.1, 0.5)==0.31 factor. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + + // host2 is 26 secs in slow start. + simTime().advanceTimeWait(std::chrono::seconds(20)); + + // Recalculate weights. + hostSet().runCallbacks({}, {}); + + // We expect 5:3 ratio, as host2 is in slow start mode and its weight is scaled with + // pow(0.43, 0.5)==0.65 factor. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + + // Advance time, so that there are no hosts in slow start. + simTime().advanceTimeWait(std::chrono::seconds(41)); + + // Recalculate weights. + hostSet().runCallbacks({}, {}); + + // Now expect 1:1 ratio.
+ EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); +} + class LeastRequestLoadBalancerTest : public LoadBalancerTestBase { public: LeastRequestLoadBalancer lb_{ - priority_set_, nullptr, stats_, runtime_, random_, common_config_, least_request_lb_config_}; + priority_set_, nullptr, stats_, runtime_, random_, common_config_, least_request_lb_config_, + simTime()}; }; TEST_P(LeastRequestLoadBalancerTest, NoHosts) { EXPECT_EQ(nullptr, lb_.chooseHost(nullptr)); } @@ -1635,11 +1948,11 @@ TEST_P(LeastRequestLoadBalancerTest, PNC) { // Creating various load balancer objects with different choice configs. envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; lr_lb_config.mutable_choice_count()->set_value(2); - LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, - random_, common_config_, lr_lb_config}; + LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; lr_lb_config.mutable_choice_count()->set_value(5); - LeastRequestLoadBalancer lb_5{priority_set_, nullptr, stats_, runtime_, - random_, common_config_, lr_lb_config}; + LeastRequestLoadBalancer lb_5{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; // Verify correct number of choices. 
@@ -1715,8 +2028,8 @@ TEST_P(LeastRequestLoadBalancerTest, WeightImbalanceWithInvalidActiveRequestBias envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; lr_lb_config.mutable_active_request_bias()->set_runtime_key("ar_bias"); lr_lb_config.mutable_active_request_bias()->set_default_value(1.0); - LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, - random_, common_config_, lr_lb_config}; + LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; EXPECT_CALL(runtime_.snapshot_, getDouble("ar_bias", 1.0)).WillRepeatedly(Return(-1.0)); @@ -1769,8 +2082,8 @@ TEST_P(LeastRequestLoadBalancerTest, WeightImbalanceWithCustomActiveRequestBias) envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; lr_lb_config.mutable_active_request_bias()->set_runtime_key("ar_bias"); lr_lb_config.mutable_active_request_bias()->set_default_value(1.0); - LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, - random_, common_config_, lr_lb_config}; + LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; EXPECT_CALL(runtime_.snapshot_, getDouble("ar_bias", 1.0)).WillRepeatedly(Return(0.0)); @@ -1815,6 +2128,197 @@ TEST_P(LeastRequestLoadBalancerTest, WeightImbalanceCallbacks) { EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr)); } +TEST_P(LeastRequestLoadBalancerTest, SlowStartWithDefaultParams) { + envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; + LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; + const auto slow_start_window = + EdfLoadBalancerBasePeer::slowStartWindow(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(0), slow_start_window); + const auto aggression = + EdfLoadBalancerBasePeer::aggression(static_cast(lb_2)); + EXPECT_EQ(1.0, aggression); + const 
auto latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(0), latest_host_added_time); +} + +TEST_P(LeastRequestLoadBalancerTest, SlowStartNoWait) { + envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; + lr_lb_config.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(60); + lr_lb_config.mutable_active_request_bias()->set_runtime_key("ar_bias"); + lr_lb_config.mutable_active_request_bias()->set_default_value(1.0); + LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; + simTime().advanceTimeWait(std::chrono::seconds(1)); + + // As no healthcheck is configured, hosts would enter slow start immediately. + hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80", simTime())}; + hostSet().hosts_ = hostSet().healthy_hosts_; + simTime().advanceTimeWait(std::chrono::seconds(5)); + // Host1 is 5 secs in slow start, its weight is scaled with (5/60)^1=0.08 factor. + hostSet().runCallbacks({}, {}); + + auto latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(1000), latest_host_added_time); + + // Advance time, so that host is no longer in slow start. + simTime().advanceTimeWait(std::chrono::seconds(56)); + + auto host2 = makeTestHost(info_, "tcp://127.0.0.1:90", simTime()); + hostSet().healthy_hosts_.push_back(host2); + hostSet().hosts_ = hostSet().healthy_hosts_; + HostVector hosts_added; + hosts_added.push_back(host2); + + hostSet().runCallbacks(hosts_added, {}); + + latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(62000), latest_host_added_time); + + // host2 is 10 secs in slow start, the weight is scaled with time factor 10 / 60 == 0.16. + simTime().advanceTimeWait(std::chrono::seconds(10)); + + // Recalculate weights. 
+ hostSet().runCallbacks({}, {}); + + hostSet().healthy_hosts_[0]->stats().rq_active_.set(1); + hostSet().healthy_hosts_[1]->stats().rq_active_.set(0); + + // We expect 3:1 ratio, as host2 is in slow start mode and its weight is scaled with + // 0.16 factor and host1 weight with 0.5 factor (due to active request bias). + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + + // host2 is 40 secs in slow start, the weight is scaled with time factor 40 / 60 == 0.66. + simTime().advanceTimeWait(std::chrono::seconds(30)); + + // Recalculate weights. + hostSet().runCallbacks({}, {}); + + // We expect 4:3 ratio, as host2 is in slow start mode and its weight is scaled with + // 0.66 factor and host1 weight with 0.5 factor. + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); +} + +TEST_P(LeastRequestLoadBalancerTest, SlowStartWaitForPassingHC) { + envoy::config::cluster::v3::Cluster::LeastRequestLbConfig lr_lb_config; + lr_lb_config.mutable_slow_start_config()->mutable_slow_start_window()->set_seconds(10); + lr_lb_config.mutable_slow_start_config()->mutable_aggression()->set_runtime_key("aggression"); + lr_lb_config.mutable_slow_start_config()->mutable_aggression()->set_default_value(0.9); + lr_lb_config.mutable_active_request_bias()->set_runtime_key("ar_bias"); + lr_lb_config.mutable_active_request_bias()->set_default_value(0.9); + + 
LeastRequestLoadBalancer lb_2{priority_set_, nullptr, stats_, runtime_, + random_, common_config_, lr_lb_config, simTime()}; + + simTime().advanceTimeWait(std::chrono::seconds(1)); + auto host1 = makeTestHost(info_, "tcp://127.0.0.1:80", simTime()); + host1->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); + + host_set_.hosts_ = {host1}; + + HostVector hosts_added; + hosts_added.push_back(host1); + simTime().advanceTimeWait(std::chrono::seconds(1)); + hostSet().runCallbacks(hosts_added, {}); + + auto latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(0), latest_host_added_time); + + simTime().advanceTimeWait(std::chrono::seconds(5)); + + hosts_added.clear(); + auto host2 = makeTestHost(info_, "tcp://127.0.0.1:90", simTime()); + hosts_added.push_back(host2); + + hostSet().healthy_hosts_ = {host1, host2}; + hostSet().hosts_ = hostSet().healthyHosts(); + hostSet().runCallbacks(hosts_added, {}); + + latest_host_added_time = + EdfLoadBalancerBasePeer::latestHostAddedTime(static_cast(lb_2)); + EXPECT_EQ(std::chrono::milliseconds(7000), latest_host_added_time); + + simTime().advanceTimeWait(std::chrono::seconds(1)); + host1->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + hostSet().healthy_hosts_ = {host1, host2}; + + hostSet().healthy_hosts_[0]->stats().rq_active_.set(1); + hostSet().healthy_hosts_[1]->stats().rq_active_.set(0); + + hostSet().healthy_hosts_ = {host1, host2}; + hostSet().hosts_ = hostSet().healthyHosts(); + + // Trigger callbacks to add host1 to slow start mode. + hostSet().runCallbacks({}, {}); + + // We expect 11:2 ratio, as host2 is in slow start mode, its weight is scaled with factor + // pow(0.1, 1.11)=0.07. Host1 is 7 seconds in slow start and its weight is scaled with active + // request and time bias 0.53 * pow(0.7, 1.11) = 0.36. 
+ + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + + simTime().advanceTimeWait(std::chrono::seconds(3)); + host1->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); + // Trigger callbacks to remove host1 from slow start mode. + hostSet().runCallbacks({}, {}); + + // We expect 3:5 ratio, as host2 is 4 seconds in slow start, its weight is scaled with factor + // pow(0.4, 1.11)=0.36. Host1 is not in slow start and its weight is scaled with active + // request bias = 0.53. + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + + // Host2 is 7 seconds in slow start, the weight is scaled with time factor 7 / 10 == 0.7. 
+ simTime().advanceTimeWait(std::chrono::seconds(3)); + + hostSet().runCallbacks({}, {}); + + // We expect 6:5 ratio, as host2 is in slow start mode, its weight is scaled with time factor + // pow(0.7, 1.11)=0.67. Host1 weight is scaled with active request bias = 0.53. + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_2.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_2.chooseHost(nullptr)); +} + INSTANTIATE_TEST_SUITE_P(PrimaryOrFailover, LeastRequestLoadBalancerTest, ::testing::Values(true, false)); diff --git a/test/common/upstream/load_balancer_simulation_test.cc b/test/common/upstream/load_balancer_simulation_test.cc index 28ded32dd029..22d081562f1a 100644 --- a/test/common/upstream/load_balancer_simulation_test.cc +++ b/test/common/upstream/load_balancer_simulation_test.cc @@ -74,11 +74,13 @@ TEST(DISABLED_LeastRequestLoadBalancerWeightTest, Weight) { ClusterStats stats{ClusterInfoImpl::generateStats(stats_store, stat_names)}; stats.max_host_weight_.set(weight); NiceMock runtime; + auto time_source = std::make_unique>(); Random::RandomGeneratorImpl random; envoy::config::cluster::v3::Cluster::LeastRequestLbConfig least_request_lb_config; envoy::config::cluster::v3::Cluster::CommonLbConfig common_config; LeastRequestLoadBalancer lb_{ - priority_set, nullptr, stats, runtime, random, common_config, least_request_lb_config}; + priority_set, nullptr, 
stats, runtime, random, common_config, least_request_lb_config, + *time_source}; absl::node_hash_map host_hits; const uint64_t total_requests = 100; diff --git a/test/common/upstream/round_robin_load_balancer_fuzz.proto b/test/common/upstream/round_robin_load_balancer_fuzz.proto index a5ecf67ccc1c..60da8d643768 100644 --- a/test/common/upstream/round_robin_load_balancer_fuzz.proto +++ b/test/common/upstream/round_robin_load_balancer_fuzz.proto @@ -4,9 +4,11 @@ syntax = "proto3"; package test.common.upstream; import "validate/validate.proto"; +import "envoy/config/cluster/v3/cluster.proto"; import "test/common/upstream/zone_aware_load_balancer_fuzz.proto"; message RoundRobinLoadBalancerTestCase { test.common.upstream.ZoneAwareLoadBalancerTestCase zone_aware_load_balancer_test_case = 1 [(validate.rules).message.required = true]; + envoy.config.cluster.v3.Cluster.RoundRobinLbConfig round_robin_lb_config = 2; } diff --git a/test/common/upstream/round_robin_load_balancer_fuzz_test.cc b/test/common/upstream/round_robin_load_balancer_fuzz_test.cc index 4c1809a9a223..75a456f44c87 100644 --- a/test/common/upstream/round_robin_load_balancer_fuzz_test.cc +++ b/test/common/upstream/round_robin_load_balancer_fuzz_test.cc @@ -31,7 +31,8 @@ DEFINE_PROTO_FUZZER(const test::common::upstream::RoundRobinLoadBalancerTestCase zone_aware_load_balancer_fuzz.local_priority_set_.get(), zone_aware_load_balancer_fuzz.stats_, zone_aware_load_balancer_fuzz.runtime_, zone_aware_load_balancer_fuzz.random_, - zone_aware_load_balancer_test_case.load_balancer_test_case().common_lb_config()); + zone_aware_load_balancer_test_case.load_balancer_test_case().common_lb_config(), + input.round_robin_lb_config(), zone_aware_load_balancer_fuzz.simTime()); } catch (EnvoyException& e) { ENVOY_LOG_MISC(debug, "EnvoyException; {}", e.what()); return; diff --git a/test/common/upstream/subset_lb_test.cc b/test/common/upstream/subset_lb_test.cc index 169202a44803..17952c30e5f7 100644 --- 
a/test/common/upstream/subset_lb_test.cc +++ b/test/common/upstream/subset_lb_test.cc @@ -201,7 +201,8 @@ class SubsetLoadBalancerTest : public Event::TestUsingSimulatedTime, lb_ = std::make_shared( lb_type_, priority_set_, nullptr, stats_, *scope_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + ring_hash_lb_config_, maglev_lb_config_, round_robin_lb_config_, least_request_lb_config_, + common_config_, simTime()); } void zoneAwareInit(const std::vector& host_metadata_per_locality, @@ -248,10 +249,10 @@ class SubsetLoadBalancerTest : public Event::TestUsingSimulatedTime, std::make_shared(), HostsPerLocalityImpl::empty()), {}, {}, {}, absl::nullopt); - lb_ = std::make_shared(lb_type_, priority_set_, &local_priority_set_, - stats_, *scope_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, - least_request_lb_config_, common_config_); + lb_ = std::make_shared( + lb_type_, priority_set_, &local_priority_set_, stats_, *scope_, runtime_, random_, + subset_info_, ring_hash_lb_config_, maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); } HostSharedPtr makeHost(const std::string& url, const HostMetadata& metadata) { @@ -475,6 +476,7 @@ class SubsetLoadBalancerTest : public Event::TestUsingSimulatedTime, envoy::config::cluster::v3::Cluster::RingHashLbConfig ring_hash_lb_config_; envoy::config::cluster::v3::Cluster::MaglevLbConfig maglev_lb_config_; envoy::config::cluster::v3::Cluster::LeastRequestLbConfig least_request_lb_config_; + envoy::config::cluster::v3::Cluster::RoundRobinLbConfig round_robin_lb_config_; envoy::config::cluster::v3::Cluster::CommonLbConfig common_config_; NiceMock runtime_; NiceMock random_; @@ -1458,9 +1460,10 @@ TEST_F(SubsetLoadBalancerTest, IgnoresHostsWithoutMetadata) { host_set_.healthy_hosts_ = host_set_.hosts_; host_set_.healthy_hosts_per_locality_ = host_set_.hosts_per_locality_; - lb_ = 
std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); TestLoadBalancerContext context_version({{"version", "1.0"}}); @@ -1877,9 +1880,10 @@ TEST_F(SubsetLoadBalancerTest, DisabledLocalityWeightAwareness) { }, host_set_, {1, 100}); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); TestLoadBalancerContext context({{"version", "1.1"}}); @@ -1900,9 +1904,10 @@ TEST_F(SubsetLoadBalancerTest, DoesNotCheckHostHealth) { EXPECT_CALL(*mock_host, weight()).WillRepeatedly(Return(1)); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); } TEST_F(SubsetLoadBalancerTest, EnabledLocalityWeightAwareness) { @@ -1923,9 +1928,10 @@ TEST_F(SubsetLoadBalancerTest, EnabledLocalityWeightAwareness) { }, host_set_, {1, 100}); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - 
ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); TestLoadBalancerContext context({{"version", "1.1"}}); @@ -1958,9 +1964,10 @@ TEST_F(SubsetLoadBalancerTest, EnabledScaleLocalityWeights) { }, host_set_, {50, 50}); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); TestLoadBalancerContext context({{"version", "1.1"}}); // Since we scale the locality weights by number of hosts removed, we expect to see the second @@ -2003,9 +2010,10 @@ TEST_F(SubsetLoadBalancerTest, EnabledScaleLocalityWeightsRounding) { }, host_set_, {2, 2}); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); TestLoadBalancerContext context({{"version", "1.0"}}); // We expect to see a 33/66 split because 2 * 1 / 2 = 1 and 2 * 3 / 4 = 1.5 -> 2 @@ -2035,9 +2043,10 @@ TEST_F(SubsetLoadBalancerTest, ScaleLocalityWeightsWithNoLocalityWeights) { }, host_set_); - lb_ = std::make_shared( - lb_type_, priority_set_, nullptr, stats_, stats_store_, runtime_, random_, 
subset_info_, - ring_hash_lb_config_, maglev_lb_config_, least_request_lb_config_, common_config_); + lb_ = std::make_shared(lb_type_, priority_set_, nullptr, stats_, stats_store_, + runtime_, random_, subset_info_, ring_hash_lb_config_, + maglev_lb_config_, round_robin_lb_config_, + least_request_lb_config_, common_config_, simTime()); } TEST_P(SubsetLoadBalancerTest, GaugesUpdatedOnDestroy) { diff --git a/test/common/upstream/zone_aware_load_balancer_fuzz_base.h b/test/common/upstream/zone_aware_load_balancer_fuzz_base.h index be4a9ecb9a05..9e455027f312 100644 --- a/test/common/upstream/zone_aware_load_balancer_fuzz_base.h +++ b/test/common/upstream/zone_aware_load_balancer_fuzz_base.h @@ -1,12 +1,14 @@ #pragma once #include "test/mocks/upstream/priority_set.h" +#include "test/test_common/simulated_time_system.h" #include "load_balancer_fuzz_base.h" namespace Envoy { namespace Upstream { -class ZoneAwareLoadBalancerFuzzBase : public LoadBalancerFuzzBase { +class ZoneAwareLoadBalancerFuzzBase : public Event::TestUsingSimulatedTime, + public LoadBalancerFuzzBase { public: ZoneAwareLoadBalancerFuzzBase(bool need_local_cluster, const std::string& random_bytestring) : random_bytestring_(random_bytestring) { diff --git a/test/integration/stats_integration_test.cc b/test/integration/stats_integration_test.cc index ffe0dcbc742c..14b2b8f61bf7 100644 --- a/test/integration/stats_integration_test.cc +++ b/test/integration/stats_integration_test.cc @@ -270,6 +270,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSize) { // 2020/10/02 13251 39326 switch to google tcmalloc // 2021/08/15 17290 40349 add all host map to priority set for fast host // searching + // 2021/08/18 13176 40577 40700 Support slow start mode // Note: when adjusting this value: EXPECT_MEMORY_EQ is active only in CI // 'release' builds, where we control the platform and tool-chain. 
So you @@ -290,7 +291,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSize) { // https://github.com/envoyproxy/envoy/issues/12209 // EXPECT_MEMORY_EQ(m_per_cluster, 37061); } - EXPECT_MEMORY_LE(m_per_cluster, 40350); // Round up to allow platform variations. + EXPECT_MEMORY_LE(m_per_cluster, 40700); // Round up to allow platform variations. } TEST_P(ClusterMemoryTestRunner, MemoryLargeHostSizeWithStats) { diff --git a/test/mocks/upstream/cluster_info.cc b/test/mocks/upstream/cluster_info.cc index c24c81d95bc2..75b0f629e68f 100644 --- a/test/mocks/upstream/cluster_info.cc +++ b/test/mocks/upstream/cluster_info.cc @@ -99,6 +99,7 @@ MockClusterInfo::MockClusterInfo() ON_CALL(*this, lbType()).WillByDefault(ReturnPointee(&lb_type_)); ON_CALL(*this, sourceAddress()).WillByDefault(ReturnRef(source_address_)); ON_CALL(*this, lbSubsetInfo()).WillByDefault(ReturnRef(lb_subset_)); + ON_CALL(*this, lbRoundRobinConfig()).WillByDefault(ReturnRef(lb_round_robin_config_)); ON_CALL(*this, lbRingHashConfig()).WillByDefault(ReturnRef(lb_ring_hash_config_)); ON_CALL(*this, lbMaglevConfig()).WillByDefault(ReturnRef(lb_maglev_config_)); ON_CALL(*this, lbOriginalDstConfig()).WillByDefault(ReturnRef(lb_original_dst_config_)); diff --git a/test/mocks/upstream/cluster_info.h b/test/mocks/upstream/cluster_info.h index 5e5415f88472..05846734280b 100644 --- a/test/mocks/upstream/cluster_info.h +++ b/test/mocks/upstream/cluster_info.h @@ -117,6 +117,8 @@ class MockClusterInfo : public ClusterInfo { lbRingHashConfig, (), (const)); MOCK_METHOD(const absl::optional&, lbMaglevConfig, (), (const)); + MOCK_METHOD(const absl::optional&, + lbRoundRobinConfig, (), (const)); MOCK_METHOD(const absl::optional&, lbLeastRequestConfig, (), (const)); MOCK_METHOD(const absl::optional&, @@ -194,6 +196,7 @@ class MockClusterInfo : public ClusterInfo { upstream_http_protocol_options_; absl::optional alternate_protocols_cache_options_; + absl::optional lb_round_robin_config_; absl::optional 
lb_ring_hash_config_; absl::optional lb_maglev_config_; absl::optional lb_original_dst_config_;