-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added histogram latency information to Hystrix dashboard stream #3986
Changes from 17 commits
92f2a12
5aec25a
a37a9a5
1e8b632
7b03da0
87b2d37
af829b3
9bb6ba2
bd40d1a
b929103
04fae8a
94929e5
eb95fdf
34afb7b
e780519
e30818c
9bbc77c
cf3adc9
3d6395a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,17 +9,19 @@ | |
|
||
#include "common/buffer/buffer_impl.h" | ||
#include "common/common/logger.h" | ||
#include "common/config/well_known_names.h" | ||
#include "common/http/headers.h" | ||
|
||
#include "absl/strings/str_cat.h" | ||
#include "absl/strings/str_split.h" | ||
#include "fmt/printf.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace StatSinks { | ||
namespace Hystrix { | ||
|
||
const uint64_t HystrixSink::DEFAULT_NUM_BUCKETS; | ||
|
||
ClusterStatsCache::ClusterStatsCache(const std::string& cluster_name) | ||
: cluster_name_(cluster_name) {} | ||
|
||
|
@@ -43,6 +45,18 @@ void ClusterStatsCache::printRollingWindow(absl::string_view name, RollingWindow | |
out_str << std::endl; | ||
} | ||
|
||
void HystrixSink::addHistogramToStream(const QuantileLatencyMap& latency_map, absl::string_view key, | ||
std::stringstream& ss) { | ||
ss << ", \"" << key << "\": {"; | ||
bool is_first = true; | ||
for (const std::pair<double, double>& element : latency_map) { | ||
std::string quantile = fmt::sprintf("%g", element.first * 100); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: |
||
HystrixSink::addDoubleToStream(quantile, element.second, ss, is_first); | ||
is_first = false; | ||
} | ||
ss << "}"; | ||
} | ||
|
||
// Add new value to rolling window, in place of oldest one. | ||
void HystrixSink::pushNewValue(RollingWindow& rolling_window, uint64_t value) { | ||
if (rolling_window.empty()) { | ||
|
@@ -118,6 +132,11 @@ void HystrixSink::addIntToStream(absl::string_view key, uint64_t value, std::str | |
addInfoToStream(key, std::to_string(value), info, is_first); | ||
} | ||
|
||
void HystrixSink::addDoubleToStream(absl::string_view key, double value, std::stringstream& info, | ||
bool is_first) { | ||
addInfoToStream(key, std::to_string(value), info, is_first); | ||
} | ||
|
||
void HystrixSink::addInfoToStream(absl::string_view key, absl::string_view value, | ||
std::stringstream& info, bool is_first) { | ||
if (!is_first) { | ||
|
@@ -131,7 +150,7 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
absl::string_view cluster_name, | ||
uint64_t max_concurrent_requests, uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, | ||
std::stringstream& ss) { | ||
const QuantileLatencyMap& histogram, std::stringstream& ss) { | ||
|
||
std::time_t currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); | ||
|
||
|
@@ -161,7 +180,7 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
addIntToStream("rollingCountResponsesFromCache", 0, ss); | ||
|
||
// Envoy's "circuit breaker" has similar meaning to hystrix's isolation | ||
// so we count upstream_rq_pending_overflow and present it as ss | ||
// so we count upstream_rq_pending_overflow and present it as rollingCountSemaphoreRejected | ||
addIntToStream("rollingCountSemaphoreRejected", rejected, ss); | ||
|
||
// Hystrix's short circuit is not similar to Envoy's since it is triggered by 503 responses | ||
|
@@ -173,12 +192,8 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
addIntToStream("rollingCountTimeout", timeouts, ss); | ||
addIntToStream("rollingCountBadRequests", 0, ss); | ||
addIntToStream("currentConcurrentExecutionCount", 0, ss); | ||
addIntToStream("latencyExecute_mean", 0, ss); | ||
|
||
// TODO trabetti : add histogram information once available by PR #2932 | ||
addInfoToStream( | ||
"latencyExecute", | ||
"{\"0\":0,\"25\":0,\"50\":0,\"75\":0,\"90\":0,\"95\":0,\"99\":0,\"99.5\":0,\"100\":0}", ss); | ||
addStringToStream("latencyExecute_mean", "null", ss); | ||
addHistogramToStream(histogram, "latencyExecute", ss); | ||
addIntToStream("propertyValue_circuitBreakerRequestVolumeThreshold", 0, ss); | ||
addIntToStream("propertyValue_circuitBreakerSleepWindowInMilliseconds", 0, ss); | ||
addIntToStream("propertyValue_circuitBreakerErrorThresholdPercentage", 0, ss); | ||
|
@@ -230,10 +245,11 @@ void HystrixSink::addClusterStatsToStream(ClusterStatsCache& cluster_stats_cache | |
uint64_t max_concurrent_requests, | ||
uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, | ||
const QuantileLatencyMap& histogram, | ||
std::stringstream& ss) { | ||
|
||
addHystrixCommand(cluster_stats_cache, cluster_name, max_concurrent_requests, reporting_hosts, | ||
rolling_window_ms, ss); | ||
rolling_window_ms, histogram, ss); | ||
addHystrixThreadPool(cluster_name, max_concurrent_requests, reporting_hosts, rolling_window_ms, | ||
ss); | ||
} | ||
|
@@ -299,13 +315,46 @@ Http::Code HystrixSink::handlerHystrixEventStream(absl::string_view, | |
return Http::Code::OK; | ||
} | ||
|
||
void HystrixSink::flush(Stats::Source&) { | ||
void HystrixSink::flush(Stats::Source& source) { | ||
if (callbacks_list_.empty()) { | ||
return; | ||
} | ||
incCounter(); | ||
std::stringstream ss; | ||
Upstream::ClusterManager::ClusterInfoMap clusters = server_.clusterManager().clusters(); | ||
|
||
// Save a map of the relevant histograms per cluster in a convenient format. | ||
std::unordered_map<std::string, QuantileLatencyMap> time_histograms; | ||
for (const Stats::ParentHistogramSharedPtr histogram : source.cachedHistograms()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: should this be |
||
if (histogram->tagExtractedName() == "cluster.upstream_rq_time") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we be hardcoding strings like this? @mrice32 do we have a better way of handling this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did not find anything searching the code |
||
// TODO(mrice32): add an Envoy utility function to look up and return a tag for a metric. | ||
auto it = std::find_if(histogram->tags().begin(), histogram->tags().end(), | ||
[](const Stats::Tag& tag) { | ||
return (tag.name_ == Config::TagNames::get().CLUSTER_NAME); | ||
}); | ||
|
||
// Make sure we found the cluster name tag | ||
ASSERT(it != histogram->tags().end()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this guaranteed? I.e. is this truly an invariant, or should we have error handling behavior? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we discussed it here #3986 (comment) |
||
auto it_bool_pair = time_histograms.emplace(std::make_pair(it->value_, QuantileLatencyMap())); | ||
// Make sure histogram with this name was not already added | ||
ASSERT(it_bool_pair.second); | ||
QuantileLatencyMap& hist_map = it_bool_pair.first->second; | ||
|
||
const std::vector<double>& supported_quantiles = | ||
histogram->intervalStatistics().supportedQuantiles(); | ||
for (size_t i = 0; i < supported_quantiles.size(); ++i) { | ||
// binary-search here is likely not worth it, as hystrix_quantiles has <10 elements. | ||
if (std::find(hystrix_quantiles.begin(), hystrix_quantiles.end(), supported_quantiles[i]) != | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is hystrix_quantiles sorted? If so, binary-search (via std::lower_bound and an equality-test) might be faster, but it's probably a small enough N that it isn't a big deal I guess. Still it's worth a comment that this is a deliberate choice. E.g.
|
||
hystrix_quantiles.end()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should there be handling for a case where the find() fails? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't it suppose to return hystrix_quantiles.end() if it fails? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of course, but is that expected? Should you handle that case? |
||
double value = histogram->intervalStatistics().computedQuantiles()[i]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: |
||
if (!std::isnan(value)) { | ||
hist_map[supported_quantiles[i]] = value; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
for (auto& cluster : clusters) { | ||
Upstream::ClusterInfoConstSharedPtr cluster_info = cluster.second.get().info(); | ||
|
||
|
@@ -323,7 +372,7 @@ void HystrixSink::flush(Stats::Source&) { | |
*cluster_stats_cache_ptr, cluster_info->name(), | ||
cluster_info->resourceManager(Upstream::ResourcePriority::Default).pendingRequests().max(), | ||
cluster_info->statsScope().gauge("membership_total").value(), server_.statsFlushInterval(), | ||
ss); | ||
time_histograms[cluster_info->name()], ss); | ||
} | ||
|
||
Buffer::OwnedImpl data; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,8 @@ | |
#include "test/mocks/upstream/mocks.h" | ||
|
||
#include "absl/strings/str_split.h" | ||
#include "circllhist.h" | ||
#include "fmt/printf.h" | ||
#include "gmock/gmock.h" | ||
#include "gtest/gtest.h" | ||
|
||
|
@@ -19,6 +21,7 @@ using testing::InSequence; | |
using testing::Invoke; | ||
using testing::NiceMock; | ||
using testing::Return; | ||
using testing::ReturnPointee; | ||
using testing::ReturnRef; | ||
|
||
namespace Envoy { | ||
|
@@ -209,6 +212,14 @@ class HystrixSinkTest : public testing::Test { | |
return cluster_message_map; | ||
} | ||
|
||
histogram_t* makeHistogram(const std::vector<uint64_t>& values) { | ||
histogram_t* histogram = hist_alloc(); // The histogram needs to be freed later | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we have a RAII wrapper for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it something like
and
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes; if you use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't, but I am tempted to use it in the test I borrowed from, thread_local_store_test. |
||
for (uint64_t value : values) { | ||
hist_insert_intscale(histogram, value, 0, 1); | ||
} | ||
return histogram; | ||
} | ||
|
||
TestRandomGenerator rand_; | ||
uint64_t window_size_ = rand_.random() % 10 + 5; // Arbitrary reasonable number. | ||
const std::string cluster1_name_{"test_cluster1"}; | ||
|
@@ -432,6 +443,61 @@ TEST_F(HystrixSinkTest, AddAndRemoveClusters) { | |
validateResults(cluster_message_map[cluster2_name_], 0, 0, 0, 0, 0, window_size_); | ||
} | ||
|
||
TEST_F(HystrixSinkTest, HistogramTest) { | ||
InSequence s; | ||
std::vector<Stats::ParentHistogramSharedPtr> stored_histograms; | ||
|
||
// Create histogram for the Hystrix sink to read. | ||
auto histogram = std::make_shared<NiceMock<Stats::MockParentHistogram>>(); | ||
histogram->name_ = "cluster." + cluster1_name_ + ".upstream_rq_time"; | ||
const std::string tag_extracted_name = "cluster.upstream_rq_time"; | ||
ON_CALL(*histogram, tagExtractedName()) | ||
.WillByDefault(testing::ReturnRefOfCopy(tag_extracted_name)); | ||
std::vector<Stats::Tag> tags; | ||
Stats::Tag tag = { | ||
Config::TagNames::get().CLUSTER_NAME, // name_ | ||
cluster1_name_ // value_ | ||
}; | ||
tags.emplace_back(tag); | ||
ON_CALL(*histogram, tags()).WillByDefault(testing::ReturnRef(tags)); | ||
|
||
histogram->used_ = true; | ||
|
||
// Init with data such that the quantile value is equal to the quantile. | ||
std::vector<uint64_t> h1_interval_values; | ||
for (size_t i = 0; i < 100; ++i) { | ||
h1_interval_values.push_back(i); | ||
} | ||
|
||
histogram_t* hist1_interval = makeHistogram(h1_interval_values); | ||
Stats::HistogramStatisticsImpl h1_interval_statistics(hist1_interval); | ||
ON_CALL(*histogram, intervalStatistics()) | ||
.WillByDefault(testing::ReturnRef(h1_interval_statistics)); | ||
stored_histograms.push_back(histogram); | ||
|
||
ON_CALL(source_, cachedHistograms()).WillByDefault(ReturnPointee(&stored_histograms)); | ||
|
||
Buffer::OwnedImpl buffer = createClusterAndCallbacks(); | ||
// Register callback to sink. | ||
sink_->registerConnection(&callbacks_); | ||
sink_->flush(source_); | ||
|
||
std::unordered_map<std::string, std::string> cluster_message_map = | ||
buildClusterMap(buffer.toString()); | ||
|
||
Json::ObjectSharedPtr latency = Json::Factory::loadFromString(cluster_message_map[cluster1_name_]) | ||
->getObject("latencyExecute"); | ||
|
||
// Data was added such that the value equals the quantile: | ||
// "latencyExecute": {"99.5": 99.500000, "95": 95.000000, "90": 90.000000, "100": 100.000000, "0": | ||
// 0.000000, "25": 25.000000, "99": 99.000000, "50": 50.000000, "75": 75.000000}. | ||
for (const double quantile : hystrix_quantiles) { | ||
EXPECT_EQ(quantile * 100, latency->getDouble(fmt::sprintf("%g", quantile * 100))); | ||
} | ||
|
||
hist_free(hist1_interval); | ||
} | ||
|
||
TEST_F(HystrixSinkTest, HystrixEventStreamHandler) { | ||
InSequence s; | ||
Buffer::OwnedImpl buffer = createClusterAndCallbacks(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems a place where a join operation would make sense, but I can see that there is already a stylized approach to building these strings in this code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you want me to consider changing the code to use join?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Up to you, TODO is fine as well.