From 5acc480382308ba974abdba03033f10bdaf200cd Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 27 Aug 2019 13:50:26 +0000 Subject: [PATCH 01/12] upstream: add failure percentage-based outlier detection Signed-off-by: James Forcier --- .../api/v2/cluster/outlier_detection.proto | 29 ++ .../v2alpha/outlier_detection_event.proto | 12 + .../cluster_manager/cluster_runtime.rst | 25 ++ .../cluster_manager/cluster_stats.rst | 4 + .../intro/arch_overview/upstream/outlier.rst | 27 ++ docs/root/intro/version_history.rst | 1 + .../common/upstream/outlier_detection_impl.cc | 96 +++++++ .../common/upstream/outlier_detection_impl.h | 17 ++ .../upstream/outlier_detection_impl_test.cc | 250 ++++++++++++++++++ 9 files changed, 461 insertions(+) diff --git a/api/envoy/api/v2/cluster/outlier_detection.proto b/api/envoy/api/v2/cluster/outlier_detection.proto index 69feb60e2325..4752bdecd792 100644 --- a/api/envoy/api/v2/cluster/outlier_detection.proto +++ b/api/envoy/api/v2/cluster/outlier_detection.proto @@ -114,4 +114,33 @@ message OutlierDetection { // is set to true. google.protobuf.UInt32Value enforcing_local_origin_success_rate = 15 [(validate.rules).uint32.lte = 100]; + + // The failure percentage to use when determining failure percentage-based outlier detection. If + // the failure percentage of a given host is greater than or equal to this value, it will be + // ejected. Defaults to 85. + google.protobuf.UInt32Value failure_percentage_threshold = 16 + [(validate.rules).uint32.lte = 100]; + + // The % chance that a host will be actually ejected when an outlier status is detected through + // failure precentage statistics. This setting can be used to disable ejection or to ramp it up + // slowly. Defaults to 0. + google.protobuf.UInt32Value enforcing_failure_percentage = 17 + [(validate.rules).uint32.lte = 100]; + + // The % chance that a host will be actually ejected when an outlier status is detected through + // local-origin failure precentage statistics. This setting can be used to disable ejection or to + // ramp it up slowly. Defaults to 0. + google.protobuf.UInt32Value enforcing_failure_percentage_local_origin = 18 + [(validate.rules).uint32.lte = 100]; + + // The minimum number of hosts in a cluster in order to perform failure percentage-based ejection. + // If the total number of hosts in the cluster is less than this value, failure percentage-based + // ejection will not be performed. Defaults to 5. + google.protobuf.UInt32Value failure_percentage_minimum_hosts = 19; + + // The minimum number of total requests that must be collected in one interval (as defined by the + // interval duration above) to perform failure percentage-based ejection for this host. If the + // volume is lower than this setting, failure percentage-based ejection will not be performed for + // this host. Defaults to 50. + google.protobuf.UInt32Value failure_percentage_request_volume = 20; } diff --git a/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto b/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto index 65559abc5485..7dd4980ba972 100644 --- a/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto +++ b/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto @@ -42,6 +42,7 @@ message OutlierDetectionEvent { option (validate.required) = true; OutlierEjectSuccessRate eject_success_rate_event = 9; OutlierEjectConsecutive eject_consecutive_event = 10; + OutlierEjectFailurePercentage eject_failure_percentage_event = 11; } } @@ -78,6 +79,12 @@ enum OutlierEjectionType { // is set to *true*. // See :ref:`Cluster outlier detection ` documentation for SUCCESS_RATE_LOCAL_ORIGIN = 4; + // Runs over aggregated success rate statistics from every host in cluster and selects hosts for + // which ratio of failed replies is above configured value. + FAILURE_PERCENTAGE = 5; + // Runs over aggregated success rate statistics for local origin failures from every host in + // cluster and selects hosts for which ratio of failed replies is above configured value. + FAILURE_PERCENTAGE_LOCAL_ORIGIN = 6; } // Represents possible action applied to upstream host @@ -100,3 +107,8 @@ message OutlierEjectSuccessRate { message OutlierEjectConsecutive { } + +message OutlierEjectFailurePercentage { + // Host's success rate at the time of the ejection event on a 0-100 range. + uint32 host_success_rate = 1 [(validate.rules).uint32.lte = 100]; +} diff --git a/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst b/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst index a64750cf6625..195d025c24bc 100644 --- a/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst +++ b/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst @@ -102,6 +102,31 @@ outlier_detection.success_rate_stdev_factor ` setting in outlier detection +outlier_detection.enforcing_failure_percentage + :ref:`enforcing_failure_percentage + ` + setting in outlier detection + +outlier_detection.enforcing_failure_percentage_local_origin + :ref:`enforcing_failure_percentage_local_origin + ` + setting in outlier detection + +outlier_detection.failure_percentage_request_volume + :ref:`failure_percentage_request_volume + ` + setting in outlier detection + +outlier_detection.failure_percentage_minimum_hosts + :ref:`failure_percentage_minimum_hosts + ` + setting in outlier detection + +outlier_detection.failure_percentage_threshold + :ref:`failure_percentage_threshold + ` + setting in outlier detection + Core ---- diff --git a/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst b/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst index f4f1890baf8d..57bd438109b6 100644 --- a/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst @@ -141,6 +141,10 @@ statistics will be rooted at *cluster..outlier_detection.* and contain the ejections_detected_consecutive_local_origin_failure, Counter, Number of detected consecutive local origin failure ejections (even if unenforced) ejections_enforced_local_origin_success_rate, Counter, Number of enforced success rate outlier ejections for locally originated failures ejections_detected_local_origin_success_rate, Counter, Number of detected success rate outlier ejections for locally originated failures (even if unenforced) + ejections_enforced_failure_percentage, Counter, Number of enforced failure percentage outlier ejections. Exact meaning of this counter depends on :ref:`outlier_detection.split_external_local_origin_errors` config item. Refer to :ref:`Outlier Detection documentation` for details. + ejections_detected_failure_percentage, Counter, Number of detected failure percentage outlier ejections (even if unenforced). Exact meaning of this counter depends on :ref:`outlier_detection.split_external_local_origin_errors` config item. Refer to :ref:`Outlier Detection documentation` for details. + ejections_enforced_failure_percentage_local_origin, Counter, Number of enforced failure percentage outlier ejections for locally originated failures + ejections_detected_failure_percentage_local_origin, Counter, Number of detected failure percentage outlier ejections for locally originated failures (even if unenforced) ejections_total, Counter, Deprecated. Number of ejections due to any outlier type (even if unenforced) ejections_consecutive_5xx, Counter, Deprecated. Number of consecutive 5xx ejections (even if unenforced) diff --git a/docs/root/intro/arch_overview/upstream/outlier.rst b/docs/root/intro/arch_overview/upstream/outlier.rst index b8b4fce4b7a1..856f231fbf07 100644 --- a/docs/root/intro/arch_overview/upstream/outlier.rst +++ b/docs/root/intro/arch_overview/upstream/outlier.rst @@ -145,6 +145,33 @@ Most configuration items, namely types of errors, but :ref:`outlier_detection.enforcing_success_rate` applies to externally originated errors only and :ref:`outlier_detection.enforcing_local_origin_success_rate` applies to locally originated errors only. +.. _arch_overview_outlier_detection_failure_percentage: + +Failure Percentage +^^^^^^^^^^^^^^^^^^ + +Failure Percentage based outlier ejection functions similarly to the success rate detecion type, in +that it relies on success rate data from each host in a cluster. However, rather than compare those +values to the mean success rate of the cluster as a whole, they are compared to a flat +user-configured threshold. This threshold is configured via the +:ref:`outlier_detection.failure_percentage_threshold` +field. + +The other configuration fields for failure percentage based ejection are similar to the fields for +success rate ejection. Failure percentage based ejection also obeys +:ref:`outlier_detection.split_external_local_origin_errors`; +the enforcement percentages for externally- and locally-originated errors are controlled by +:ref:`outlier_detection.enforcing_failure_percentage` +and +:ref:`outlier_detection.enforcing_failure_percentage_local_origin`, +respectively. As with success rate detection, detection will not be performed for a host if its +request volume over the aggregation interval is less than the +:ref:`outlier_detection.failure_percentage_request_volume` +value. Detection also will not be performed for a cluster if the number of hosts with the minimum +required request volume in an interval is less than the +:ref:`outlier_detection.failure_percentage_minimum_hosts` +value. + .. _arch_overview_outlier_detection_logging: diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 653bc4f65d6d..eb645cc38d0e 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -53,6 +53,7 @@ Version history * tracing: added tags for gRPC response status and meesage. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. * upstream: added network filter chains to upstream connections, see :ref:`filters`. +* upstream: added new :ref:`failure-percentage based outlier detection( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, success_rate_stdev_factor, 1900))), + failure_percentage_threshold_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_threshold, 85))), + failure_percentage_minimum_hosts_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_minimum_hosts, 5))), + failure_percentage_request_volume_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_request_volume, 50))), enforcing_consecutive_5xx_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_5xx, 100))), enforcing_consecutive_gateway_failure_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_gateway_failure, 0))), enforcing_success_rate_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_success_rate, 100))), + enforcing_failure_percentage_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage, 0))), + enforcing_failure_percentage_local_origin_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage_local_origin, 0))), split_external_local_origin_errors_(config.split_external_local_origin_errors()), consecutive_local_origin_failure_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, consecutive_local_origin_failure, 5))), @@ -355,6 +365,14 @@ bool DetectorImpl::enforceEjection(envoy::data::cluster::v2alpha::OutlierEjectio return runtime_.snapshot().featureEnabled( "outlier_detection.enforcing_local_origin_success_rate", config_.enforcingLocalOriginSuccessRate()); + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + return runtime_.snapshot().featureEnabled( + "outlier_detection.enforcing_failure_percentage", + config_.enforcingFailurePercentage()); + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + return runtime_.snapshot().featureEnabled( + "outlier_detection.enforcing_failure_percentage_local_origin", + config_.enforcingFailurePercentageLocalOrigin()); default: // Checked by schema. NOT_REACHED_GCOVR_EXCL_LINE; @@ -382,6 +400,12 @@ void DetectorImpl::updateEnforcedEjectionStats( case envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN: stats_.ejections_enforced_local_origin_success_rate_.inc(); break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + stats_.ejections_enforced_failure_percentage_.inc(); + break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + stats_.ejections_enforced_local_origin_failure_percentage_.inc(); + break; default: // Checked by schema. NOT_REACHED_GCOVR_EXCL_LINE; @@ -406,6 +430,12 @@ void DetectorImpl::updateDetectedEjectionStats( case envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN: stats_.ejections_detected_local_origin_success_rate_.inc(); break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + stats_.ejections_detected_failure_percentage_.inc(); + break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + stats_.ejections_detected_local_origin_failure_percentage_.inc(); + break; default: // Checked by schema. NOT_REACHED_GCOVR_EXCL_LINE; @@ -609,6 +639,61 @@ void DetectorImpl::processSuccessRateEjections( } } +void DetectorImpl::processFailurePercentageEjections( + DetectorHostMonitor::SuccessRateMonitorType monitor_type) { + uint64_t failure_percentage_minimum_hosts = runtime_.snapshot().getInteger( + "outlier_detection.failure_percentage_minimum_hosts", + config_.failurePercentageMinimumHosts()); + uint64_t failure_percentage_request_volume = runtime_.snapshot().getInteger( + "outlier_detection.failure_percentage_request_volume", + config_.failurePercentageRequestVolume()); + std::vector valid_failure_percentage_hosts; + + // Exit early if there are not enough hosts. + if (host_monitors_.size() < failure_percentage_minimum_hosts) { + return; + } + + // reserve upper bound of vector size to avoid reallocation. + valid_failure_percentage_hosts.reserve(host_monitors_.size()); + + for (const auto& host : host_monitors_) { + if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { + absl::optional host_success_rate = host.second->getSRMonitor(monitor_type) + .successRateAccumulator() + .getSuccessRate(failure_percentage_request_volume); + + if (host_success_rate) { + valid_failure_percentage_hosts.emplace_back( + HostSuccessRatePair(host.first, host_success_rate.value())); + } + } + } + + if (!valid_failure_percentage_hosts.empty() && + valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) { + const double failure_percentage_threshold = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_threshold", + config_.failurePercentageThreshold()) / + 100.0; + + for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) { + if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) { + // We should eject. + + // The ejection type returned by the SuccessRateMonitor's getEjectionType() will be a + // SUCCESS_RATE type, so we need to figure it out for ourselves. + const envoy::data::cluster::v2alpha::OutlierEjectionType type = + (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin) + ? envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE + : envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN; + updateDetectedEjectionStats(type); + ejectHost(host_success_rate_pair.host_, type); + } + } + } +} + void DetectorImpl::onIntervalTimer() { MonotonicTime now = time_source_.monotonicTime(); @@ -626,6 +711,9 @@ void DetectorImpl::onIntervalTimer() { processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); + processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); + processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); + armIntervalTimer(); } @@ -660,6 +748,14 @@ void EventLoggerImpl::logEject(const HostDescriptionConstSharedPtr& host, Detect detector.successRateEjectionThreshold(monitor_type)); event.mutable_eject_success_rate_event()->set_host_success_rate( host->outlierDetector().successRate(monitor_type)); + } else if ((type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) || + (type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN)) { + const DetectorHostMonitor::SuccessRateMonitorType monitor_type = + (type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) + ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin + : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin; + event.mutable_eject_failure_percentage_event()->set_host_success_rate( + host->outlierDetector().successRate(monitor_type)); } else { event.mutable_eject_consecutive_event(); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 2b703c2adba4..cc1860f05c5a 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -214,13 +214,17 @@ class DetectorHostMonitorImpl : public DetectorHostMonitor { COUNTER(ejections_detected_consecutive_5xx) \ COUNTER(ejections_detected_consecutive_gateway_failure) \ COUNTER(ejections_detected_success_rate) \ + COUNTER(ejections_detected_failure_percentage) \ COUNTER(ejections_enforced_consecutive_5xx) \ COUNTER(ejections_enforced_consecutive_gateway_failure) \ COUNTER(ejections_enforced_success_rate) \ + COUNTER(ejections_enforced_failure_percentage) \ COUNTER(ejections_detected_consecutive_local_origin_failure) \ COUNTER(ejections_enforced_consecutive_local_origin_failure) \ COUNTER(ejections_detected_local_origin_success_rate) \ COUNTER(ejections_enforced_local_origin_success_rate) \ + COUNTER(ejections_detected_local_origin_failure_percentage) \ + COUNTER(ejections_enforced_local_origin_failure_percentage) \ COUNTER(ejections_enforced_total) \ COUNTER(ejections_overflow) \ COUNTER(ejections_success_rate) \ @@ -249,11 +253,18 @@ class DetectorConfig { uint64_t successRateMinimumHosts() const { return success_rate_minimum_hosts_; } uint64_t successRateRequestVolume() const { return success_rate_request_volume_; } uint64_t successRateStdevFactor() const { return success_rate_stdev_factor_; } + uint64_t failurePercentageThreshold() const { return failure_percentage_threshold_; } + uint64_t failurePercentageMinimumHosts() const { return failure_percentage_minimum_hosts_; } + uint64_t failurePercentageRequestVolume() const { return failure_percentage_request_volume_; } uint64_t enforcingConsecutive5xx() const { return enforcing_consecutive_5xx_; } uint64_t enforcingConsecutiveGatewayFailure() const { return enforcing_consecutive_gateway_failure_; } uint64_t enforcingSuccessRate() const { return enforcing_success_rate_; } + uint64_t enforcingFailurePercentage() const { return enforcing_failure_percentage_; } + uint64_t enforcingFailurePercentageLocalOrigin() const { + return enforcing_failure_percentage_local_origin_; + } bool splitExternalLocalOriginErrors() const { return split_external_local_origin_errors_; } uint64_t consecutiveLocalOriginFailure() const { return consecutive_local_origin_failure_; } uint64_t enforcingConsecutiveLocalOriginFailure() const { @@ -270,9 +281,14 @@ class DetectorConfig { const uint64_t success_rate_minimum_hosts_; const uint64_t success_rate_request_volume_; const uint64_t success_rate_stdev_factor_; + const uint64_t failure_percentage_threshold_; + const uint64_t failure_percentage_minimum_hosts_; + const uint64_t failure_percentage_request_volume_; const uint64_t enforcing_consecutive_5xx_; const uint64_t enforcing_consecutive_gateway_failure_; const uint64_t enforcing_success_rate_; + const uint64_t enforcing_failure_percentage_; + const uint64_t enforcing_failure_percentage_local_origin_; const bool split_external_local_origin_errors_; const uint64_t consecutive_local_origin_failure_; const uint64_t enforcing_consecutive_local_origin_failure_; @@ -348,6 +364,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_thisconfig().successRateMinimumHosts()); EXPECT_EQ(200UL, detector->config().successRateRequestVolume()); EXPECT_EQ(3000UL, detector->config().successRateStdevFactor()); + EXPECT_EQ(0UL, detector->config().enforcingFailurePercentage()); + EXPECT_EQ(0UL, detector->config().enforcingFailurePercentageLocalOrigin()); + EXPECT_EQ(10UL, detector->config().failurePercentageMinimumHosts()); + EXPECT_EQ(25UL, detector->config().failurePercentageRequestVolume()); + EXPECT_EQ(70UL, detector->config().failurePercentageThreshold()); } TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -1056,6 +1064,248 @@ TEST_F(OutlierDetectorImplTest, EmptySuccessRate) { interval_timer_->invokeCallback(); } +TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { + EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); + addHosts({ + "tcp://127.0.0.1:80", + "tcp://127.0.0.1:81", + "tcp://127.0.0.1:82", + "tcp://127.0.0.1:83", + "tcp://127.0.0.1:84", + }); + + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + std::shared_ptr detector(DetectorImpl::create( + cluster_, empty_outlier_detection_, dispatcher_, runtime_, time_system_, event_logger_)); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); + + // Turn off 5xx detection and SR detection to test FP detection in isolation. + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_consecutive_gateway_failure", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_success_rate", 100)) + .WillByDefault(Return(false)); + // Now turn on FP detection. + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_failure_percentage", 0)) + .WillByDefault(Return(true)); + // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter + // gets saturated (every 5 times). + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[3]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(50); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[3]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(50); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(60); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(60); + + // Cause a FP error on one host. First 3 hosts have perfect FP; fourth host has failure percentage + // slightly below threshold; fifth has FP slightly above threshold. + loadRq(hosts_, 50, 200); + loadRq(hosts_[3], 250, 503); + loadRq(hosts_[4], 300, 503); + + time_system_.setMonotonicTime(std::chrono::milliseconds(10000)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), + _, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE, + true)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.success_rate_stdev_factor", 1900)) + .WillByDefault(Return(1900)); + interval_timer_->invokeCallback(); + EXPECT_FLOAT_EQ(100.0 * (50.0 / 300.0), + hosts_[3]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_FLOAT_EQ(100.0 * (50.0 / 350.0), + hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + // Make sure that local origin success rate monitor is not affected + EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that doesn't bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(19999)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that does bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(50001)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_FALSE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + + // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter + // gets saturated (every 5 times). + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(5); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(5); + + // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. + loadRq(hosts_, 25, 200); + loadRq(hosts_[4], 25, 503); + + time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateAverage( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); +} + +TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { + EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); + addHosts({ + "tcp://127.0.0.1:80", + "tcp://127.0.0.1:81", + "tcp://127.0.0.1:82", + "tcp://127.0.0.1:83", + "tcp://127.0.0.1:84", + }); + + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + std::shared_ptr detector(DetectorImpl::create( + cluster_, outlier_detection_split_, dispatcher_, runtime_, time_system_, event_logger_)); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); + + // Turn off 5xx detection and SR detection to test FP detection in isolation. + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_consecutive_local_origin_failure", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_local_origin_success_rate", 100)) + .WillByDefault(Return(false)); + // Now turn on FP detection. + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_failure_percentage_local_origin", 0)) + .WillByDefault(Return(true)); + // Expect non-enforcing logging to happen every time the consecutive_ counter + // gets saturated (every 5 times). + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, + false)) + .Times(40); + // Cause a SR error on one host. First have 4 of the hosts have perfect SR. + loadRq(hosts_, 200, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); + loadRq(hosts_[4], 200, Result::LOCAL_ORIGIN_CONNECT_FAILED); + + time_system_.setMonotonicTime(std::chrono::milliseconds(10000)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN, + true)); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN, + false)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.failure_percentage_threshold", 85)) + .WillByDefault(Return(40)); + interval_timer_->invokeCallback(); + EXPECT_EQ(50, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(90, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(52, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + // Make sure that external origin success rate monitor is not affected + EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateAverage( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that doesn't bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(19999)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that does bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(50001)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_FALSE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + + // Expect non-enforcing logging to happen every time the consecutive_ counter + // gets saturated (every 5 times). + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, + false)) + .Times(5); + + // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. + loadRq(hosts_, 25, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); + loadRq(hosts_[4], 25, Result::LOCAL_ORIGIN_CONNECT_FAILED); + + time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); +} + TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); addHosts({"tcp://127.0.0.1:80"}); From b066c4342849aee3bb90671559220e7dce7b2cf3 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 3 Sep 2019 20:34:58 +0000 Subject: [PATCH 02/12] Appease format/docs issues Signed-off-by: James Forcier --- .../api/v2/cluster/outlier_detection.proto | 6 +-- docs/root/intro/version_history.rst | 2 +- .../common/upstream/outlier_detection_impl.cc | 40 ++++++++++--------- .../upstream/outlier_detection_impl_test.cc | 17 ++++---- 4 files changed, 31 insertions(+), 34 deletions(-) diff --git a/api/envoy/api/v2/cluster/outlier_detection.proto b/api/envoy/api/v2/cluster/outlier_detection.proto index 4752bdecd792..cf9c3b41f10f 100644 --- a/api/envoy/api/v2/cluster/outlier_detection.proto +++ b/api/envoy/api/v2/cluster/outlier_detection.proto @@ -118,14 +118,12 @@ message OutlierDetection { // The failure percentage to use when determining failure percentage-based outlier detection. If // the failure percentage of a given host is greater than or equal to this value, it will be // ejected. Defaults to 85. - google.protobuf.UInt32Value failure_percentage_threshold = 16 - [(validate.rules).uint32.lte = 100]; + google.protobuf.UInt32Value failure_percentage_threshold = 16 [(validate.rules).uint32.lte = 100]; // The % chance that a host will be actually ejected when an outlier status is detected through // failure precentage statistics. This setting can be used to disable ejection or to ramp it up // slowly. Defaults to 0. - google.protobuf.UInt32Value enforcing_failure_percentage = 17 - [(validate.rules).uint32.lte = 100]; + google.protobuf.UInt32Value enforcing_failure_percentage = 17 [(validate.rules).uint32.lte = 100]; // The % chance that a host will be actually ejected when an outlier status is detected through // local-origin failure precentage statistics. This setting can be used to disable ejection or to diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index eb645cc38d0e..2856a81ca25a 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -53,7 +53,7 @@ Version history * tracing: added tags for gRPC response status and meesage. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. * upstream: added network filter chains to upstream connections, see :ref:`filters`. -* upstream: added new :ref:`failure-percentage based outlier detection` mode. * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. * zookeeper: parse responses and emit latency stats. diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index ed4fcf77c3f8..fae7d111c44f 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -366,9 +366,8 @@ bool DetectorImpl::enforceEjection(envoy::data::cluster::v2alpha::OutlierEjectio "outlier_detection.enforcing_local_origin_success_rate", config_.enforcingLocalOriginSuccessRate()); case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: - return runtime_.snapshot().featureEnabled( - "outlier_detection.enforcing_failure_percentage", - config_.enforcingFailurePercentage()); + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_failure_percentage", + config_.enforcingFailurePercentage()); case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: return runtime_.snapshot().featureEnabled( "outlier_detection.enforcing_failure_percentage_local_origin", @@ -641,12 +640,12 @@ void DetectorImpl::processSuccessRateEjections( void DetectorImpl::processFailurePercentageEjections( DetectorHostMonitor::SuccessRateMonitorType monitor_type) { - uint64_t failure_percentage_minimum_hosts = runtime_.snapshot().getInteger( - "outlier_detection.failure_percentage_minimum_hosts", - config_.failurePercentageMinimumHosts()); - uint64_t failure_percentage_request_volume = runtime_.snapshot().getInteger( - "outlier_detection.failure_percentage_request_volume", - config_.failurePercentageRequestVolume()); + uint64_t failure_percentage_minimum_hosts = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_minimum_hosts", + config_.failurePercentageMinimumHosts()); + uint64_t failure_percentage_request_volume = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_request_volume", + config_.failurePercentageRequestVolume()); std::vector valid_failure_percentage_hosts; // Exit early if there are not enough hosts. @@ -659,9 +658,10 @@ void DetectorImpl::processFailurePercentageEjections( for (const auto& host : host_monitors_) { if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - absl::optional host_success_rate = host.second->getSRMonitor(monitor_type) - .successRateAccumulator() - .getSuccessRate(failure_percentage_request_volume); + absl::optional host_success_rate = + host.second->getSRMonitor(monitor_type) + .successRateAccumulator() + .getSuccessRate(failure_percentage_request_volume); if (host_success_rate) { valid_failure_percentage_hosts.emplace_back( @@ -673,9 +673,9 @@ void DetectorImpl::processFailurePercentageEjections( if (!valid_failure_percentage_hosts.empty() && valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) { const double failure_percentage_threshold = - runtime_.snapshot().getInteger("outlier_detection.failure_percentage_threshold", - config_.failurePercentageThreshold()) / - 100.0; + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_threshold", + config_.failurePercentageThreshold()) / + 100.0; for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) { if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) { @@ -684,9 +684,10 @@ void DetectorImpl::processFailurePercentageEjections( // The ejection type returned by the SuccessRateMonitor's getEjectionType() will be a // SUCCESS_RATE type, so we need to figure it out for ourselves. const envoy::data::cluster::v2alpha::OutlierEjectionType type = - (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin) - ? envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE - : envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN; + (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin) + ? envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE + : envoy::data::cluster::v2alpha::OutlierEjectionType:: + FAILURE_PERCENTAGE_LOCAL_ORIGIN; updateDetectedEjectionStats(type); ejectHost(host_success_rate_pair.host_, type); } @@ -749,7 +750,8 @@ void EventLoggerImpl::logEject(const HostDescriptionConstSharedPtr& host, Detect event.mutable_eject_success_rate_event()->set_host_success_rate( host->outlierDetector().successRate(monitor_type)); } else if ((type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) || - (type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN)) { + (type == envoy::data::cluster::v2alpha::OutlierEjectionType:: + FAILURE_PERCENTAGE_LOCAL_ORIGIN)) { const DetectorHostMonitor::SuccessRateMonitorType monitor_type = (type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index ed6fd1ea6595..b4d8cc57341e 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -1085,12 +1085,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_gateway_failure", 100)) .WillByDefault(Return(false)); - ON_CALL(runtime_.snapshot_, - featureEnabled("outlier_detection.enforcing_success_rate", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_success_rate", 100)) .WillByDefault(Return(false)); // Now turn on FP detection. - ON_CALL(runtime_.snapshot_, - featureEnabled("outlier_detection.enforcing_failure_percentage", 0)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_failure_percentage", 0)) .WillByDefault(Return(true)); // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter // gets saturated (every 5 times). @@ -1124,8 +1122,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(10000)); EXPECT_CALL(checker_, check(hosts_[4])); EXPECT_CALL(*event_logger_, - logEject(std::static_pointer_cast(hosts_[4]), - _, + logEject(std::static_pointer_cast(hosts_[4]), _, envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE, true)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); @@ -1133,11 +1130,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { .WillByDefault(Return(1900)); interval_timer_->invokeCallback(); EXPECT_FLOAT_EQ(100.0 * (50.0 / 300.0), - hosts_[3]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + hosts_[3]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_FLOAT_EQ(100.0 * (50.0 / 350.0), - hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); // Make sure that local origin success rate monitor is not affected EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); From 074a7ccfc7e97a8057676c1c4fdc3ab093adf1c9 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 3 Sep 2019 20:38:19 +0000 Subject: [PATCH 03/12] precentage -> percentage Signed-off-by: James Forcier --- api/envoy/api/v2/cluster/outlier_detection.proto | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/envoy/api/v2/cluster/outlier_detection.proto b/api/envoy/api/v2/cluster/outlier_detection.proto index cf9c3b41f10f..60e6ee7d2cb0 100644 --- a/api/envoy/api/v2/cluster/outlier_detection.proto +++ b/api/envoy/api/v2/cluster/outlier_detection.proto @@ -121,12 +121,12 @@ message OutlierDetection { google.protobuf.UInt32Value failure_percentage_threshold = 16 [(validate.rules).uint32.lte = 100]; // The % chance that a host will be actually ejected when an outlier status is detected through - // failure precentage statistics. This setting can be used to disable ejection or to ramp it up + // failure percentage statistics. This setting can be used to disable ejection or to ramp it up // slowly. Defaults to 0. google.protobuf.UInt32Value enforcing_failure_percentage = 17 [(validate.rules).uint32.lte = 100]; // The % chance that a host will be actually ejected when an outlier status is detected through - // local-origin failure precentage statistics. This setting can be used to disable ejection or to + // local-origin failure percentage statistics. This setting can be used to disable ejection or to // ramp it up slowly. Defaults to 0. google.protobuf.UInt32Value enforcing_failure_percentage_local_origin = 18 [(validate.rules).uint32.lte = 100]; From c55e82d6e594d6cbaa1dd5822d65bd854c86cca1 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 3 Sep 2019 20:43:24 +0000 Subject: [PATCH 04/12] Add FP to spellcheck dictionary Signed-off-by: James Forcier --- tools/spelling_dictionary.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spelling_dictionary.txt b/tools/spelling_dictionary.txt index e0ccece9cfc5..896ae4a81bcd 100644 --- a/tools/spelling_dictionary.txt +++ b/tools/spelling_dictionary.txt @@ -93,6 +93,7 @@ FDs FFFF FIN FIPS +FP FQDN FUZZER FUZZERS From 86220fb6cb96a325b37739235823fde13dd62d76 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 3 Sep 2019 21:14:03 +0000 Subject: [PATCH 05/12] Add check for non-ejected host to unit test Signed-off-by: James Forcier --- test/common/upstream/outlier_detection_impl_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index b4d8cc57341e..18eb094d4a5e 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -1142,6 +1142,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_FALSE(hosts_[3]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); From 79c3aaaae6c828537f54b64aac54bf3d64bc0fee Mon Sep 17 00:00:00 2001 From: James Forcier Date: Wed, 4 Sep 2019 17:27:01 +0000 Subject: [PATCH 06/12] Drop extra 'have' in test comment Signed-off-by: James Forcier --- test/common/upstream/outlier_detection_impl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 18eb094d4a5e..1d963a6336f3 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -1226,7 +1226,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, false)) .Times(40); - // Cause a SR error on one host. First have 4 of the hosts have perfect SR. + // Cause a SR error on one host. First 4 of the hosts have perfect SR. loadRq(hosts_, 200, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); loadRq(hosts_[4], 200, Result::LOCAL_ORIGIN_CONNECT_FAILED); From d297566514061de4961495587712c24c608c2f28 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Thu, 5 Sep 2019 21:10:12 +0000 Subject: [PATCH 07/12] Drop 'FP' abbreviation of failure percentage. Signed-off-by: James Forcier --- .../upstream/outlier_detection_impl_test.cc | 16 +++++++++------- tools/spelling_dictionary.txt | 1 - 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 1d963a6336f3..d45aaabac7c2 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -1079,7 +1079,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { cluster_, empty_outlier_detection_, dispatcher_, runtime_, time_system_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - // Turn off 5xx detection and SR detection to test FP detection in isolation. + // Turn off 5xx detection and SR detection to test failure percentage detection in isolation. ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(false)); ON_CALL(runtime_.snapshot_, @@ -1087,7 +1087,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { .WillByDefault(Return(false)); ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_success_rate", 100)) .WillByDefault(Return(false)); - // Now turn on FP detection. + // Now turn on failure percentage detection. ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_failure_percentage", 0)) .WillByDefault(Return(true)); // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter @@ -1113,8 +1113,9 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { false)) .Times(60); - // Cause a FP error on one host. First 3 hosts have perfect FP; fourth host has failure percentage - // slightly below threshold; fifth has FP slightly above threshold. + // Cause a failure percentage error on one host. First 3 hosts have perfect failure percentage; + // fourth host has failure percentage slightly below threshold; fifth has failure percentage + // slightly above threshold. loadRq(hosts_, 50, 200); loadRq(hosts_[3], 250, 503); loadRq(hosts_[4], 300, 503); @@ -1207,14 +1208,14 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { cluster_, outlier_detection_split_, dispatcher_, runtime_, time_system_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - // Turn off 5xx detection and SR detection to test FP detection in isolation. + // Turn off 5xx detection and SR detection to test failure percentage detection in isolation. ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_local_origin_failure", 100)) .WillByDefault(Return(false)); ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_local_origin_success_rate", 100)) .WillByDefault(Return(false)); - // Now turn on FP detection. + // Now turn on failure percentage detection. ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_failure_percentage_local_origin", 0)) .WillByDefault(Return(true)); @@ -1226,7 +1227,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, false)) .Times(40); - // Cause a SR error on one host. First 4 of the hosts have perfect SR. + // Cause a failure percentage error on one host. First 4 of the hosts have perfect failure + // percentage. loadRq(hosts_, 200, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); loadRq(hosts_[4], 200, Result::LOCAL_ORIGIN_CONNECT_FAILED); diff --git a/tools/spelling_dictionary.txt b/tools/spelling_dictionary.txt index 896ae4a81bcd..e0ccece9cfc5 100644 --- a/tools/spelling_dictionary.txt +++ b/tools/spelling_dictionary.txt @@ -93,7 +93,6 @@ FDs FFFF FIN FIPS -FP FQDN FUZZER FUZZERS From 691a70b70bae2ed01e550b472a95948ce347fc1d Mon Sep 17 00:00:00 2001 From: James Forcier Date: Fri, 6 Sep 2019 14:53:30 +0000 Subject: [PATCH 08/12] Consolidate process{SuccessRate,FailurePercentage}Ejections together Signed-off-by: James Forcier --- .../common/upstream/outlier_detection_impl.cc | 87 ++++++++----------- .../common/upstream/outlier_detection_impl.h | 3 +- .../upstream/outlier_detection_impl_test.cc | 24 +++-- 3 files changed, 52 insertions(+), 62 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index fae7d111c44f..867ae3b9f148 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -585,32 +585,51 @@ void DetectorImpl::processSuccessRateEjections( "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); + uint64_t failure_percentage_minimum_hosts = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_minimum_hosts", + config_.failurePercentageMinimumHosts()); + uint64_t failure_percentage_request_volume = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_request_volume", + config_.failurePercentageRequestVolume()); + std::vector valid_success_rate_hosts; + std::vector valid_failure_percentage_hosts; double success_rate_sum = 0; // Reset the Detector's success rate mean and stdev. getSRNums(monitor_type) = {-1, -1}; // Exit early if there are not enough hosts. - if (host_monitors_.size() < success_rate_minimum_hosts) { + if (host_monitors_.size() < success_rate_minimum_hosts && + host_monitors_.size() < failure_percentage_minimum_hosts) { return; } // reserve upper bound of vector size to avoid reallocation. valid_success_rate_hosts.reserve(host_monitors_.size()); + valid_failure_percentage_hosts.reserve(host_monitors_.size()); for (const auto& host : host_monitors_) { // Don't do work if the host is already ejected. if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - absl::optional host_success_rate = host.second->getSRMonitor(monitor_type) - .successRateAccumulator() - .getSuccessRate(success_rate_request_volume); - - if (host_success_rate) { - valid_success_rate_hosts.emplace_back( - HostSuccessRatePair(host.first, host_success_rate.value())); - success_rate_sum += host_success_rate.value(); - host.second->successRate(monitor_type, host_success_rate.value()); + std::pair host_success_rate_and_volume = + host.second->getSRMonitor(monitor_type) + .successRateAccumulator() + .getSuccessRateAndVolume(); + double success_rate = host_success_rate_and_volume.first; + double request_volume = host_success_rate_and_volume.second; + + if (request_volume >= + std::min(success_rate_request_volume, failure_percentage_request_volume)) { + host.second->successRate(monitor_type, success_rate); + } + + if (request_volume >= success_rate_request_volume) { + valid_success_rate_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate)); + success_rate_sum += success_rate; + } + if (request_volume >= failure_percentage_request_volume) { + valid_failure_percentage_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate)); } } } @@ -636,39 +655,6 @@ void DetectorImpl::processSuccessRateEjections( } } } -} - -void DetectorImpl::processFailurePercentageEjections( - DetectorHostMonitor::SuccessRateMonitorType monitor_type) { - uint64_t failure_percentage_minimum_hosts = - runtime_.snapshot().getInteger("outlier_detection.failure_percentage_minimum_hosts", - config_.failurePercentageMinimumHosts()); - uint64_t failure_percentage_request_volume = - runtime_.snapshot().getInteger("outlier_detection.failure_percentage_request_volume", - config_.failurePercentageRequestVolume()); - std::vector valid_failure_percentage_hosts; - - // Exit early if there are not enough hosts. - if (host_monitors_.size() < failure_percentage_minimum_hosts) { - return; - } - - // reserve upper bound of vector size to avoid reallocation. - valid_failure_percentage_hosts.reserve(host_monitors_.size()); - - for (const auto& host : host_monitors_) { - if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - absl::optional host_success_rate = - host.second->getSRMonitor(monitor_type) - .successRateAccumulator() - .getSuccessRate(failure_percentage_request_volume); - - if (host_success_rate) { - valid_failure_percentage_hosts.emplace_back( - HostSuccessRatePair(host.first, host_success_rate.value())); - } - } - } if (!valid_failure_percentage_hosts.empty() && valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) { @@ -712,8 +698,8 @@ void DetectorImpl::onIntervalTimer() { processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); - processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); - processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); + // processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); + // processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); armIntervalTimer(); } @@ -805,14 +791,11 @@ SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() { return current_success_rate_bucket_.get(); } -absl::optional -SuccessRateAccumulator::getSuccessRate(uint64_t success_rate_request_volume) { - if (backup_success_rate_bucket_->total_request_counter_ < success_rate_request_volume) { - return {}; - } +std::pair SuccessRateAccumulator::getSuccessRateAndVolume() { + double success_rate = backup_success_rate_bucket_->success_request_counter_ * 100.0 / + backup_success_rate_bucket_->total_request_counter_; - return {backup_success_rate_bucket_->success_request_counter_ * 100.0 / - backup_success_rate_bucket_->total_request_counter_}; + return {success_rate, backup_success_rate_bucket_->total_request_counter_}; } } // namespace Outlier diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index cc1860f05c5a..06d4b32cae11 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -92,7 +92,7 @@ class SuccessRateAccumulator { * @return a valid absl::optional with the success rate. If there were not enough * requests, an invalid absl::optional is returned. */ - absl::optional getSuccessRate(uint64_t success_rate_request_volume); + std::pair getSuccessRateAndVolume(); private: std::unique_ptr current_success_rate_bucket_; @@ -364,7 +364,6 @@ class DetectorImpl : public Detector, public std::enable_shared_from_thisinvokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateAverage( DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( @@ -1040,9 +1042,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRateLocalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( @@ -1184,9 +1188,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateAverage( DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( @@ -1297,9 +1303,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( From 7bdccce53fa74fbcdfe3e921498498c497da22e9 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Fri, 6 Sep 2019 16:45:26 +0000 Subject: [PATCH 09/12] Fix divide by zero and remove stale comment Signed-off-by: James Forcier --- .../common/upstream/outlier_detection_impl.cc | 21 ++++++++++++------- .../common/upstream/outlier_detection_impl.h | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 867ae3b9f148..3c1dfad9a022 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -612,12 +612,16 @@ void DetectorImpl::processSuccessRateEjections( for (const auto& host : host_monitors_) { // Don't do work if the host is already ejected. if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - std::pair host_success_rate_and_volume = + absl::optional> host_success_rate_and_volume = host.second->getSRMonitor(monitor_type) .successRateAccumulator() .getSuccessRateAndVolume(); - double success_rate = host_success_rate_and_volume.first; - double request_volume = host_success_rate_and_volume.second; + + if (!host_success_rate_and_volume) { + continue; + } + double success_rate = host_success_rate_and_volume.value().first; + double request_volume = host_success_rate_and_volume.value().second; if (request_volume >= std::min(success_rate_request_volume, failure_percentage_request_volume)) { @@ -698,9 +702,6 @@ void DetectorImpl::onIntervalTimer() { processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); - // processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin); - // processFailurePercentageEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin); - armIntervalTimer(); } @@ -791,11 +792,15 @@ SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() { return current_success_rate_bucket_.get(); } -std::pair SuccessRateAccumulator::getSuccessRateAndVolume() { +absl::optional> SuccessRateAccumulator::getSuccessRateAndVolume() { + if (!backup_success_rate_bucket_->total_request_counter_) { + return absl::nullopt; + } + double success_rate = backup_success_rate_bucket_->success_request_counter_ * 100.0 / backup_success_rate_bucket_->total_request_counter_; - return {success_rate, backup_success_rate_bucket_->total_request_counter_}; + return {{success_rate, backup_success_rate_bucket_->total_request_counter_}}; } } // namespace Outlier diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 06d4b32cae11..633cd52f16cb 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -92,7 +92,7 @@ class SuccessRateAccumulator { * @return a valid absl::optional with the success rate. If there were not enough * requests, an invalid absl::optional is returned. */ - std::pair getSuccessRateAndVolume(); + absl::optional> getSuccessRateAndVolume(); private: std::unique_ptr current_success_rate_bucket_; From 61cfcaf2a060522d5e88939a67ea594ba91df2d3 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Fri, 6 Sep 2019 19:33:53 +0000 Subject: [PATCH 10/12] Correctly compute failure_percentage_threshold Signed-off-by: James Forcier --- source/common/upstream/outlier_detection_impl.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 3c1dfad9a022..436a2ef28633 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -662,10 +662,8 @@ void DetectorImpl::processSuccessRateEjections( if (!valid_failure_percentage_hosts.empty() && valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) { - const double failure_percentage_threshold = - runtime_.snapshot().getInteger("outlier_detection.failure_percentage_threshold", - config_.failurePercentageThreshold()) / - 100.0; + const double failure_percentage_threshold = runtime_.snapshot().getInteger( + "outlier_detection.failure_percentage_threshold", config_.failurePercentageThreshold()); for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) { if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) { From c48506d8edc9fc8638311719eac3a3d1b17953bd Mon Sep 17 00:00:00 2001 From: James Forcier Date: Mon, 9 Sep 2019 17:53:09 +0000 Subject: [PATCH 11/12] Add testing for new logging path Signed-off-by: James Forcier --- .../upstream/outlier_detection_impl_test.cc | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 35a32502f1df..1df588abbc6f 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -1603,6 +1603,36 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { .WillOnce(SaveArg<0>(&log4)); event_logger.logUneject(host); Json::Factory::loadFromString(log4); + + StringViewSaver log5; + EXPECT_CALL(host->outlier_detector_, lastUnejectionTime()).WillOnce(ReturnRef(monotonic_time)); + EXPECT_CALL(host->outlier_detector_, + successRate(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)) + .WillOnce(Return(0)); + EXPECT_CALL(*file, + write(absl::string_view( + "{\"type\":\"FAILURE_PERCENTAGE\",\"cluster_name\":\"fake_cluster\"," + "\"upstream_url\":\"10.0.0.1:443\",\"action\":\"EJECT\"," + "\"num_ejections\":0,\"enforced\":false,\"eject_failure_percentage_event\":{" + "\"host_success_rate\":0},\"timestamp\":\"2018-12-18T09:00:00Z\"," + "\"secs_since_last_action\":\"30\"}\n"))) + .WillOnce(SaveArg<0>(&log5)); + event_logger.logEject(host, detector, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE, + false); + Json::Factory::loadFromString(log5); + + StringViewSaver log6; + EXPECT_CALL(host->outlier_detector_, lastEjectionTime()).WillOnce(ReturnRef(monotonic_time)); + EXPECT_CALL(*file, + write(absl::string_view( + "{\"type\":\"CONSECUTIVE_5XX\",\"cluster_name\":\"fake_cluster\"," + "\"upstream_url\":\"10.0.0.1:443\",\"action\":\"UNEJECT\"," + "\"num_ejections\":0,\"enforced\":false,\"timestamp\":\"2018-12-18T09:00:00Z\"," + "\"secs_since_last_action\":\"30\"}\n"))) + .WillOnce(SaveArg<0>(&log6)); + event_logger.logUneject(host); + Json::Factory::loadFromString(log6); } TEST(OutlierUtility, SRThreshold) { From 6b68a74053e8b401ed44557db5e567acaba5d454 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Mon, 9 Sep 2019 17:59:36 +0000 Subject: [PATCH 12/12] Add comment about cleanup in v4 to enforcing_failure_percentage option. Signed-off-by: James Forcier --- api/envoy/api/v2/cluster/outlier_detection.proto | 3 +++ 1 file changed, 3 insertions(+) diff --git a/api/envoy/api/v2/cluster/outlier_detection.proto b/api/envoy/api/v2/cluster/outlier_detection.proto index 60e6ee7d2cb0..36f8831057de 100644 --- a/api/envoy/api/v2/cluster/outlier_detection.proto +++ b/api/envoy/api/v2/cluster/outlier_detection.proto @@ -123,6 +123,9 @@ message OutlierDetection { // The % chance that a host will be actually ejected when an outlier status is detected through // failure percentage statistics. This setting can be used to disable ejection or to ramp it up // slowly. Defaults to 0. + // + // [#next-major-version: setting this without setting failure_percentage_threshold should be + // invalid in v4.] google.protobuf.UInt32Value enforcing_failure_percentage = 17 [(validate.rules).uint32.lte = 100]; // The % chance that a host will be actually ejected when an outlier status is detected through