diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index d03cd512893b..a998870d2a03 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -35,16 +35,6 @@ upstream.weight_enabled upstream.zone_routing.enabled % of requests that will be routed to the same upstream zone. Defaults to 100% of requests. -upstream.zone_routing.percent_diff - Zone aware routing will be used only if the percent of upstream hosts in the same zone is within - percent_diff of expected. Expected is calculated as 100 / number_of_zones. This prevents Envoy - from using same zone routing if the zones are not balanced well. Defaults to 3% allowed deviation. - -upstream.zone_routing.healthy_panic_threshold - Defines the :ref:`zone healthy panic threshold ` - percentage. Defaults to 80%. If the % of healthy hosts in the current zone falls below this % - all healthy hosts will be used for routing. - circuit_breakers...max_connections :ref:`Max connections circuit breaker setting ` diff --git a/docs/intro/arch_overview/load_balancing.rst b/docs/intro/arch_overview/load_balancing.rst index 998a85b4697b..954178130f76 100644 --- a/docs/intro/arch_overview/load_balancing.rst +++ b/docs/intro/arch_overview/load_balancing.rst @@ -57,6 +57,5 @@ Zone aware routing and local zone panic threshold ------------------------------------------------- By default Envoy performs zone aware routing where it will send traffic to the same upstream zone. -This is only done if the zones are well balanced (defaults to 3% allowed deviation) and if there -are enough healthy hosts in the local zone (the *panic threshold* which defaults to 80%). These are -:ref:`configurable ` via runtime. +This feature is in active development. +You can enable/disable it :ref:`via runtime `. 
diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 50192c8d2e06..24f9ac074aae 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -177,11 +177,12 @@ class HostSet { COUNTER(update_attempt) \ COUNTER(update_success) \ COUNTER(update_failure) \ + COUNTER(zone_cluster_too_small) \ + COUNTER(zone_over_percentage) \ + COUNTER(zone_routing_sampled) \ + COUNTER(zone_routing_no_sampled) \ GAUGE (max_host_weight) \ - GAUGE (upstream_zone_count) \ - COUNTER(upstream_zone_above_threshold) \ - COUNTER(upstream_zone_healthy_panic) \ - COUNTER(upstream_zone_within_threshold) + GAUGE (upstream_zone_count) // clang-format on /** diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index a5284080c6d4..1e3452f9a05e 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -295,7 +295,7 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::ClusterEntry( break; } case LoadBalancerType::RoundRobin: { - lb_.reset(new RoundRobinLoadBalancer(host_set_, cluster.stats(), runtime)); + lb_.reset(new RoundRobinLoadBalancer(host_set_, cluster.stats(), runtime, random)); break; } } diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index c03e3c0daf06..8fed50577334 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -24,39 +24,50 @@ const std::vector& LoadBalancerBase::hostsToUse() { return host_set_.hosts(); } + uint32_t number_of_zones = stats_.upstream_zone_count_.value(); // Early exit if we cannot perform zone aware routing. 
- if (stats_.upstream_zone_count_.value() < 2 || host_set_.localZoneHealthyHosts().empty() || + if (number_of_zones < 2 || host_set_.localZoneHealthyHosts().empty() || !runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 100)) { return host_set_.healthyHosts(); } - double zone_to_all_percent = - 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); - double expected_percent = 100.0 / stats_.upstream_zone_count_.value(); - - uint64_t zone_percent_diff = - runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 3); - - // Hosts should be roughly equally distributed between zones. - if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) { - stats_.upstream_zone_above_threshold_.inc(); + // Do not perform zone routing for small clusters. + uint64_t min_cluster_size = + runtime_.snapshot().getInteger("upstream.zone_routing.min_cluster_size", 6U); + if (host_set_.healthyHosts().size() < min_cluster_size) { + stats_.zone_cluster_too_small_.inc(); return host_set_.healthyHosts(); } - stats_.upstream_zone_within_threshold_.inc(); + // If the number of hosts in the local zone is big enough, route all requests to the same zone. + if (host_set_.localZoneHealthyHosts().size() * number_of_zones >= + host_set_.healthyHosts().size()) { + stats_.zone_over_percentage_.inc(); + return host_set_.localZoneHealthyHosts(); + } + + // If local zone ratio is lower than expected we should only partially route requests from the + // same zone. 
+ double zone_host_ratio = + 1.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); + double ratio_to_route = zone_host_ratio * number_of_zones; - uint64_t zone_panic_threshold = - runtime_.snapshot().getInteger("upstream.zone_routing.healthy_panic_threshold", 80); - double zone_healthy_percent = - 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.localZoneHosts().size(); - if (zone_healthy_percent < zone_panic_threshold) { - stats_.upstream_zone_healthy_panic_.inc(); + // Requests that are not zone routed will be distributed between all hosts and hence + // we need to route only a fraction of ratio_to_route to the local zone. + double actual_routing_ratio = (ratio_to_route - zone_host_ratio) / (1 - zone_host_ratio); + // Scale actual_routing_ratio to improve precision. + const uint64_t scale_factor = 10000; + uint64_t zone_routing_threshold = scale_factor * actual_routing_ratio; + + if (random_.random() % 10000 < zone_routing_threshold) { + stats_.zone_routing_sampled_.inc(); + return host_set_.localZoneHealthyHosts(); + } else { + stats_.zone_routing_no_sampled_.inc(); + return host_set_.healthyHosts(); } - - return host_set_.localZoneHealthyHosts(); } ConstHostPtr RoundRobinLoadBalancer::chooseHost() { @@ -71,7 +82,7 @@ ConstHostPtr RoundRobinLoadBalancer::chooseHost() { LeastRequestLoadBalancer::LeastRequestLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime, Runtime::RandomGenerator& random) - : LoadBalancerBase(host_set, stats, runtime), random_(random) { + : LoadBalancerBase(host_set, stats, runtime, random) { host_set.addMemberUpdateCb( [this](const std::vector&, const std::vector& hosts_removed) -> void { if (last_host_) { diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index 0e44bd1fc623..048e66bb843b 100644 --- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -11,8 +11,9 @@ namespace 
Upstream { */ class LoadBalancerBase { protected: - LoadBalancerBase(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime) - : stats_(stats), runtime_(runtime), host_set_(host_set) {} + LoadBalancerBase(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime, + Runtime::RandomGenerator& random) + : stats_(stats), runtime_(runtime), random_(random), host_set_(host_set) {} /** * Pick the host list to use (healthy or all depending on how many in the set are not healthy). @@ -21,6 +22,7 @@ class LoadBalancerBase { ClusterStats& stats_; Runtime::Loader& runtime_; + Runtime::RandomGenerator& random_; private: const HostSet& host_set_; @@ -31,8 +33,9 @@ class LoadBalancerBase { */ class RoundRobinLoadBalancer : public LoadBalancer, LoadBalancerBase { public: - RoundRobinLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime) - : LoadBalancerBase(host_set, stats, runtime) {} + RoundRobinLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime, + Runtime::RandomGenerator& random) + : LoadBalancerBase(host_set, stats, runtime, random) {} // Upstream::LoadBalancer ConstHostPtr chooseHost() override; @@ -63,7 +66,6 @@ class LeastRequestLoadBalancer : public LoadBalancer, LoadBalancerBase { ConstHostPtr chooseHost() override; private: - Runtime::RandomGenerator& random_; HostPtr last_host_; uint32_t hits_left_{}; }; @@ -75,13 +77,10 @@ class RandomLoadBalancer : public LoadBalancer, LoadBalancerBase { public: RandomLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime, Runtime::RandomGenerator& random) - : LoadBalancerBase(host_set, stats, runtime), random_(random) {} + : LoadBalancerBase(host_set, stats, runtime, random) {} // Upstream::LoadBalancer ConstHostPtr chooseHost() override; - -private: - Runtime::RandomGenerator& random_; }; } // Upstream diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc 
index 5f7ccdb1839a..596b0c8fcf8c 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -20,9 +20,10 @@ class RoundRobinLoadBalancerTest : public testing::Test { NiceMock cluster_; NiceMock runtime_; + NiceMock random_; Stats::IsolatedStoreImpl stats_store_; ClusterStats stats_; - RoundRobinLoadBalancer lb_{cluster_, stats_, runtime_}; + RoundRobinLoadBalancer lb_{cluster_, stats_, runtime_, random_}; }; TEST_F(RoundRobinLoadBalancerTest, NoHosts) { EXPECT_EQ(nullptr, lb_.chooseHost()); } @@ -66,7 +67,7 @@ TEST_F(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) { EXPECT_EQ(3UL, stats_.upstream_rq_lb_healthy_panic_.value()); } -TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareSmallCluster) { cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; @@ -79,25 +80,76 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(true)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) - .WillRepeatedly(Return(80)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) - .WillRepeatedly(Return(2)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6)) + .WillRepeatedly(Return(6)); + + EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(1U, stats_.zone_cluster_too_small_.value()); + EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); + EXPECT_EQ(2U, stats_.zone_cluster_too_small_.value()); +} + +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingLargeZone) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + 
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + stats_.upstream_zone_count_.set(3UL); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6)) + .Times(2) + .WillRepeatedly(Return(3)); // There is only one host in the given zone for zone aware routing. EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost()); - EXPECT_EQ(1UL, stats_.upstream_zone_within_threshold_.value()); - + EXPECT_EQ(1U, stats_.zone_over_percentage_.value()); EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost()); - EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); + EXPECT_EQ(2U, stats_.zone_over_percentage_.value()); // Disable runtime global zone routing. 
EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(false)); EXPECT_EQ(cluster_.healthy_hosts_[2], lb_.chooseHost()); - EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); +} + +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingSmallZone) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:83"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:84")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:83"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:84")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + stats_.upstream_zone_count_.set(3UL); + + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillRepeatedly(Return(50)); + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6)) + .Times(2) + .WillRepeatedly(Return(5)); + + // There is only one host in the given zone for zone aware routing. 
+ EXPECT_CALL(random_, random()).WillOnce(Return(1000)); + EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(1U, stats_.zone_routing_sampled_.value()); + EXPECT_CALL(random_, random()).WillOnce(Return(6500)); + EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); + EXPECT_EQ(1U, stats_.zone_routing_no_sampled_.value()); } TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) { @@ -108,15 +160,10 @@ TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) { stats_.upstream_zone_count_.set(1UL); EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)).Times(0); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) - .Times(0); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); - EXPECT_EQ(0UL, stats_.upstream_zone_within_threshold_.value()); - EXPECT_EQ(0UL, stats_.upstream_zone_above_threshold_.value()); } TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { @@ -135,43 +182,11 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); - // Should not be called due to early exit. - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) - .Times(0); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0); - // local zone has no healthy hosts, take from the all healthy hosts. 
EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); } -TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotEnoughHealthy) { - cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), - newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), - newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; - cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), - newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), - newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; - cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; - cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; - stats_.upstream_zone_count_.set(2UL); - - EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) - .WillRepeatedly(Return(true)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) - .WillRepeatedly(Return(50)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) - .WillRepeatedly(Return(80)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) - .WillRepeatedly(Return(3)); - - // Not enough healthy hosts in local zone. 
- EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); - EXPECT_EQ(1UL, stats_.upstream_zone_above_threshold_.value()); - EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); - EXPECT_EQ(2UL, stats_.upstream_zone_above_threshold_.value()); -} - class LeastRequestLoadBalancerTest : public testing::Test { public: LeastRequestLoadBalancerTest() : stats_(ClusterImplBase::generateStats("", stats_store_)) {} diff --git a/test/common/upstream/load_balancer_simulation_test.cc b/test/common/upstream/load_balancer_simulation_test.cc index 0648312a2ffd..d33300e28bb4 100644 --- a/test/common/upstream/load_balancer_simulation_test.cc +++ b/test/common/upstream/load_balancer_simulation_test.cc @@ -25,8 +25,8 @@ class DISABLED_SimulationTest : public testing::Test { .WillByDefault(Return(50U)); ON_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillByDefault(Return(true)); - ON_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) - .WillByDefault(Return(3)); + ON_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6)) + .WillByDefault(Return(6)); } /** @@ -111,7 +111,7 @@ class DISABLED_SimulationTest : public testing::Test { return ret; }; - const uint32_t total_number_of_requests = 100000; + const uint32_t total_number_of_requests = 3000000; NiceMock cluster_; NiceMock runtime_; @@ -127,7 +127,19 @@ TEST_F(DISABLED_SimulationTest, strictlyEqualDistribution) { } TEST_F(DISABLED_SimulationTest, unequalZoneDistribution) { + run({1U, 1U, 1U}, {2U, 5U, 5U}, {2U, 5U, 5U}); +} + +TEST_F(DISABLED_SimulationTest, unequalZoneDistribution2) { run({1U, 1U, 1U}, {5U, 5U, 6U}, {5U, 5U, 6U}); } +TEST_F(DISABLED_SimulationTest, unequalZoneDistribution3) { + run({1U, 1U, 1U}, {10U, 10U, 10U}, {10U, 8U, 8U}); +} + +TEST_F(DISABLED_SimulationTest, unequalZoneDistribution4) { + run({20U, 20U, 21U}, {4U, 4U, 5U}, {4U, 5U, 5U}); +} + } // Upstream \ No newline at end of file