Skip to content

Commit

Permalink
Zone routing v2 (#115)
Browse files Browse the repository at this point in the history
  • Loading branch information
RomanDzhabarov authored Oct 10, 2016
1 parent 4abb327 commit 50241f1
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 98 deletions.
10 changes: 0 additions & 10 deletions docs/configuration/cluster_manager/cluster_runtime.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,6 @@ upstream.weight_enabled
upstream.zone_routing.enabled
% of requests that will be routed to the same upstream zone. Defaults to 100% of requests.

upstream.zone_routing.percent_diff
Zone aware routing will be used only if the percentage of upstream hosts in the same zone is
within percent_diff of the expected value. The expected value is calculated as
100 / number_of_zones. This prevents Envoy from using same zone routing if the zones are not
well balanced. Defaults to 3% allowed deviation.

upstream.zone_routing.healthy_panic_threshold
Defines the :ref:`zone healthy panic threshold <arch_overview_load_balancing_zone_panic_threshold>`
percentage. Defaults to 80%. If the % of healthy hosts in the current zone falls below this %,
all healthy hosts will be used for routing.

circuit_breakers.<cluster_name>.<priority>.max_connections
:ref:`Max connections circuit breaker setting <config_cluster_manager_cluster_circuit_breakers_max_connections>`

Expand Down
5 changes: 2 additions & 3 deletions docs/intro/arch_overview/load_balancing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,5 @@ Zone aware routing and local zone panic threshold
-------------------------------------------------

By default, Envoy performs zone aware routing, where it will send traffic to the same upstream zone.
This is only done if the zones are well balanced (defaults to 3% allowed deviation) and if there
are enough healthy hosts in the local zone (the *panic threshold* which defaults to 80%). These are
:ref:`configurable <config_cluster_manager_cluster_runtime>` via runtime.
This feature is in active development.
You can enable/disable it :ref:`via runtime <config_cluster_manager_cluster_runtime>`.
9 changes: 5 additions & 4 deletions include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ class HostSet {
COUNTER(update_attempt) \
COUNTER(update_success) \
COUNTER(update_failure) \
COUNTER(zone_cluster_too_small) \
COUNTER(zone_over_percentage) \
COUNTER(zone_routing_sampled) \
COUNTER(zone_routing_no_sampled) \
GAUGE (max_host_weight) \
GAUGE (upstream_zone_count) \
COUNTER(upstream_zone_above_threshold) \
COUNTER(upstream_zone_healthy_panic) \
COUNTER(upstream_zone_within_threshold)
GAUGE (upstream_zone_count)
// clang-format on

/**
Expand Down
2 changes: 1 addition & 1 deletion source/common/upstream/cluster_manager_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::ClusterEntry(
break;
}
case LoadBalancerType::RoundRobin: {
lb_.reset(new RoundRobinLoadBalancer(host_set_, cluster.stats(), runtime));
lb_.reset(new RoundRobinLoadBalancer(host_set_, cluster.stats(), runtime, random));
break;
}
}
Expand Down
53 changes: 32 additions & 21 deletions source/common/upstream/load_balancer_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,39 +24,50 @@ const std::vector<HostPtr>& LoadBalancerBase::hostsToUse() {
return host_set_.hosts();
}

uint32_t number_of_zones = stats_.upstream_zone_count_.value();
// Early exit if we cannot perform zone aware routing.
if (stats_.upstream_zone_count_.value() < 2 || host_set_.localZoneHealthyHosts().empty() ||
if (number_of_zones < 2 || host_set_.localZoneHealthyHosts().empty() ||
!runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 100)) {
return host_set_.healthyHosts();
}

double zone_to_all_percent =
100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size();
double expected_percent = 100.0 / stats_.upstream_zone_count_.value();

uint64_t zone_percent_diff =
runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 3);

// Hosts should be roughly equally distributed between zones.
if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) {
stats_.upstream_zone_above_threshold_.inc();
// Do not perform zone routing for small clusters.
uint64_t min_cluster_size =
runtime_.snapshot().getInteger("upstream.zone_routing.min_cluster_size", 6U);

if (host_set_.healthyHosts().size() < min_cluster_size) {
stats_.zone_cluster_too_small_.inc();
return host_set_.healthyHosts();
}

stats_.upstream_zone_within_threshold_.inc();
// If number of hosts in a local zone big enough route all requests to the same zone.
if (host_set_.localZoneHealthyHosts().size() * number_of_zones >=
host_set_.healthyHosts().size()) {
stats_.zone_over_percentage_.inc();
return host_set_.localZoneHealthyHosts();
}

// If local zone ratio is lower than expected we should only partially route requests from the
// same zone.
double zone_host_ratio =
1.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size();
double ratio_to_route = zone_host_ratio * number_of_zones;

uint64_t zone_panic_threshold =
runtime_.snapshot().getInteger("upstream.zone_routing.healthy_panic_threshold", 80);
double zone_healthy_percent =
100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.localZoneHosts().size();
if (zone_healthy_percent < zone_panic_threshold) {
stats_.upstream_zone_healthy_panic_.inc();
// Not zone routed requests will be distributed between all hosts and hence
// we need to route only fraction of req_percent_to_route to the local zone.
double actual_routing_ratio = (ratio_to_route - zone_host_ratio) / (1 - zone_host_ratio);

// Scale actual_routing_ratio to improve precision.
const uint64_t scale_factor = 10000;
uint64_t zone_routing_threshold = scale_factor * actual_routing_ratio;

if (random_.random() % 10000 < zone_routing_threshold) {
stats_.zone_routing_sampled_.inc();
return host_set_.localZoneHealthyHosts();
} else {
stats_.zone_routing_no_sampled_.inc();
return host_set_.healthyHosts();
}

return host_set_.localZoneHealthyHosts();
}

ConstHostPtr RoundRobinLoadBalancer::chooseHost() {
Expand All @@ -71,7 +82,7 @@ ConstHostPtr RoundRobinLoadBalancer::chooseHost() {
LeastRequestLoadBalancer::LeastRequestLoadBalancer(const HostSet& host_set, ClusterStats& stats,
Runtime::Loader& runtime,
Runtime::RandomGenerator& random)
: LoadBalancerBase(host_set, stats, runtime), random_(random) {
: LoadBalancerBase(host_set, stats, runtime, random) {
host_set.addMemberUpdateCb(
[this](const std::vector<HostPtr>&, const std::vector<HostPtr>& hosts_removed) -> void {
if (last_host_) {
Expand Down
17 changes: 8 additions & 9 deletions source/common/upstream/load_balancer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ namespace Upstream {
*/
class LoadBalancerBase {
protected:
LoadBalancerBase(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime)
: stats_(stats), runtime_(runtime), host_set_(host_set) {}
LoadBalancerBase(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime,
Runtime::RandomGenerator& random)
: stats_(stats), runtime_(runtime), random_(random), host_set_(host_set) {}

/**
* Pick the host list to use (healthy or all depending on how many in the set are not healthy).
Expand All @@ -21,6 +22,7 @@ class LoadBalancerBase {

ClusterStats& stats_;
Runtime::Loader& runtime_;
Runtime::RandomGenerator& random_;

private:
const HostSet& host_set_;
Expand All @@ -31,8 +33,9 @@ class LoadBalancerBase {
*/
class RoundRobinLoadBalancer : public LoadBalancer, LoadBalancerBase {
public:
RoundRobinLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime)
: LoadBalancerBase(host_set, stats, runtime) {}
RoundRobinLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime,
Runtime::RandomGenerator& random)
: LoadBalancerBase(host_set, stats, runtime, random) {}

// Upstream::LoadBalancer
ConstHostPtr chooseHost() override;
Expand Down Expand Up @@ -63,7 +66,6 @@ class LeastRequestLoadBalancer : public LoadBalancer, LoadBalancerBase {
ConstHostPtr chooseHost() override;

private:
Runtime::RandomGenerator& random_;
HostPtr last_host_;
uint32_t hits_left_{};
};
Expand All @@ -75,13 +77,10 @@ class RandomLoadBalancer : public LoadBalancer, LoadBalancerBase {
public:
RandomLoadBalancer(const HostSet& host_set, ClusterStats& stats, Runtime::Loader& runtime,
Runtime::RandomGenerator& random)
: LoadBalancerBase(host_set, stats, runtime), random_(random) {}
: LoadBalancerBase(host_set, stats, runtime, random) {}

// Upstream::LoadBalancer
ConstHostPtr chooseHost() override;

private:
Runtime::RandomGenerator& random_;
};

} // Upstream
109 changes: 62 additions & 47 deletions test/common/upstream/load_balancer_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ class RoundRobinLoadBalancerTest : public testing::Test {

NiceMock<MockCluster> cluster_;
NiceMock<Runtime::MockLoader> runtime_;
NiceMock<Runtime::MockRandomGenerator> random_;
Stats::IsolatedStoreImpl stats_store_;
ClusterStats stats_;
RoundRobinLoadBalancer lb_{cluster_, stats_, runtime_};
RoundRobinLoadBalancer lb_{cluster_, stats_, runtime_, random_};
};

TEST_F(RoundRobinLoadBalancerTest, NoHosts) { EXPECT_EQ(nullptr, lb_.chooseHost()); }
Expand Down Expand Up @@ -66,7 +67,7 @@ TEST_F(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) {
EXPECT_EQ(3UL, stats_.upstream_rq_lb_healthy_panic_.value());
}

TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) {
TEST_F(RoundRobinLoadBalancerTest, ZoneAwareSmallCluster) {
cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")};
Expand All @@ -79,25 +80,76 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) {

EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillRepeatedly(Return(true));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80))
.WillRepeatedly(Return(80));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3))
.WillRepeatedly(Return(2));
EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.min_cluster_size", 6))
.WillRepeatedly(Return(6));

EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(1U, stats_.zone_cluster_too_small_.value());
EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost());
EXPECT_EQ(2U, stats_.zone_cluster_too_small_.value());
}

TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingLargeZone) {
cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")};
cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")};
cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
stats_.upstream_zone_count_.set(3UL);

EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50))
.WillRepeatedly(Return(50));
EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillRepeatedly(Return(true));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6))
.Times(2)
.WillRepeatedly(Return(3));

// There is only one host in the given zone for zone aware routing.
EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(1UL, stats_.upstream_zone_within_threshold_.value());

EXPECT_EQ(1U, stats_.zone_over_percentage_.value());
EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value());
EXPECT_EQ(2U, stats_.zone_over_percentage_.value());

// Disable runtime global zone routing.
EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillRepeatedly(Return(false));
EXPECT_EQ(cluster_.healthy_hosts_[2], lb_.chooseHost());
EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value());
}

TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingSmallZone) {
cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:83"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:84")};
cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:83"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:84")};
cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
stats_.upstream_zone_count_.set(3UL);

EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50))
.WillRepeatedly(Return(50));
EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillRepeatedly(Return(true));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6))
.Times(2)
.WillRepeatedly(Return(5));

// There is only one host in the given zone for zone aware routing.
EXPECT_CALL(random_, random()).WillOnce(Return(1000));
EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(1U, stats_.zone_routing_sampled_.value());
EXPECT_CALL(random_, random()).WillOnce(Return(6500));
EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost());
EXPECT_EQ(1U, stats_.zone_routing_no_sampled_.value());
}

TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) {
Expand All @@ -108,15 +160,10 @@ TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) {
stats_.upstream_zone_count_.set(1UL);

EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)).Times(0);
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80))
.Times(0);
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0);
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50))
.WillRepeatedly(Return(50));

EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(0UL, stats_.upstream_zone_within_threshold_.value());
EXPECT_EQ(0UL, stats_.upstream_zone_above_threshold_.value());
}

TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) {
Expand All @@ -135,43 +182,11 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) {
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50))
.WillRepeatedly(Return(50));

// Should not be called due to early exit.
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80))
.Times(0);
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0);

// local zone has no healthy hosts, take from the all healthy hosts.
EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost());
}

TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotEnoughHealthy) {
cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")};
cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"),
newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")};
cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")};
stats_.upstream_zone_count_.set(2UL);

EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillRepeatedly(Return(true));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50))
.WillRepeatedly(Return(50));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80))
.WillRepeatedly(Return(80));
EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3))
.WillRepeatedly(Return(3));

// Not enough healthy hosts in local zone.
EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost());
EXPECT_EQ(1UL, stats_.upstream_zone_above_threshold_.value());
EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost());
EXPECT_EQ(2UL, stats_.upstream_zone_above_threshold_.value());
}

class LeastRequestLoadBalancerTest : public testing::Test {
public:
LeastRequestLoadBalancerTest() : stats_(ClusterImplBase::generateStats("", stats_store_)) {}
Expand Down
18 changes: 15 additions & 3 deletions test/common/upstream/load_balancer_simulation_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ class DISABLED_SimulationTest : public testing::Test {
.WillByDefault(Return(50U));
ON_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100))
.WillByDefault(Return(true));
ON_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3))
.WillByDefault(Return(3));
ON_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6))
.WillByDefault(Return(6));
}

/**
Expand Down Expand Up @@ -111,7 +111,7 @@ class DISABLED_SimulationTest : public testing::Test {
return ret;
};

const uint32_t total_number_of_requests = 100000;
const uint32_t total_number_of_requests = 3000000;

NiceMock<MockCluster> cluster_;
NiceMock<Runtime::MockLoader> runtime_;
Expand All @@ -127,7 +127,19 @@ TEST_F(DISABLED_SimulationTest, strictlyEqualDistribution) {
}

TEST_F(DISABLED_SimulationTest, unequalZoneDistribution) {
run({1U, 1U, 1U}, {2U, 5U, 5U}, {2U, 5U, 5U});
}

TEST_F(DISABLED_SimulationTest, unequalZoneDistribution2) {
run({1U, 1U, 1U}, {5U, 5U, 6U}, {5U, 5U, 6U});
}

TEST_F(DISABLED_SimulationTest, unequalZoneDistribution3) {
run({1U, 1U, 1U}, {10U, 10U, 10U}, {10U, 8U, 8U});
}

TEST_F(DISABLED_SimulationTest, unequalZoneDistribution4) {
run({20U, 20U, 21U}, {4U, 4U, 5U}, {4U, 5U, 5U});
}

} // Upstream

0 comments on commit 50241f1

Please sign in to comment.