-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added histogram latency information to Hystrix dashboard stream #3986
Changes from 8 commits
92f2a12
5aec25a
a37a9a5
1e8b632
7b03da0
87b2d37
af829b3
9bb6ba2
bd40d1a
b929103
04fae8a
94929e5
eb95fdf
34afb7b
e780519
e30818c
9bbc77c
cf3adc9
3d6395a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,14 +10,17 @@ | |
#include "common/http/headers.h" | ||
|
||
#include "absl/strings/str_cat.h" | ||
#include "absl/strings/str_split.h" | ||
#include "fmt/printf.h" | ||
|
||
namespace Envoy { | ||
namespace Extensions { | ||
namespace StatSinks { | ||
namespace Hystrix { | ||
|
||
const uint64_t HystrixSink::DEFAULT_NUM_BUCKETS; | ||
|
||
static const std::vector<double> hystrix_quantiles = {0, 0.25, 0.5, 0.75, 0.90, | ||
0.95, 0.99, 0.995, 1}; | ||
ClusterStatsCache::ClusterStatsCache(const std::string& cluster_name) | ||
: cluster_name_(cluster_name) {} | ||
|
||
|
@@ -41,6 +44,18 @@ void ClusterStatsCache::printRollingWindow(absl::string_view name, RollingWindow | |
out_str << std::endl; | ||
} | ||
|
||
void HystrixSink::addHistogramToStream(QuantileLatencyMap latency_map, absl::string_view key, | ||
std::stringstream& ss) { | ||
ss << ", \"" << key << "\": {"; | ||
bool is_first = true; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems a place where a join operation would make sense, but I can see that there is already a stylized approach to building these strings in this code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want me to consider changing the code to use join? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Up to you, TODO is fine as well. |
||
for (const std::pair<double, double> element : latency_map) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. const std::pair<double, double>& |
||
std::string quantile = fmt::sprintf("%g", element.first * 100); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: |
||
HystrixSink::addDoubleToStream(quantile, element.second, ss, is_first); | ||
is_first = false; | ||
} | ||
ss << "}"; | ||
} | ||
|
||
// Add new value to rolling window, in place of oldest one. | ||
void HystrixSink::pushNewValue(RollingWindow& rolling_window, uint64_t value) { | ||
if (rolling_window.empty()) { | ||
|
@@ -116,6 +131,11 @@ void HystrixSink::addIntToStream(absl::string_view key, uint64_t value, std::str | |
addInfoToStream(key, std::to_string(value), info, is_first); | ||
} | ||
|
||
void HystrixSink::addDoubleToStream(absl::string_view key, double value, std::stringstream& info, | ||
bool is_first) { | ||
addInfoToStream(key, std::to_string(value), info, is_first); | ||
} | ||
|
||
void HystrixSink::addInfoToStream(absl::string_view key, absl::string_view value, | ||
std::stringstream& info, bool is_first) { | ||
if (!is_first) { | ||
|
@@ -129,7 +149,7 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
absl::string_view cluster_name, | ||
uint64_t max_concurrent_requests, uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, | ||
std::stringstream& ss) { | ||
QuantileLatencyMap& histogram, std::stringstream& ss) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: |
||
|
||
std::time_t currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); | ||
|
||
|
@@ -159,7 +179,7 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
addIntToStream("rollingCountResponsesFromCache", 0, ss); | ||
|
||
// Envoy's "circuit breaker" has similar meaning to hystrix's isolation | ||
// so we count upstream_rq_pending_overflow and present it as ss | ||
// so we count upstream_rq_pending_overflow and present it as rollingCountSemaphoreRejected | ||
addIntToStream("rollingCountSemaphoreRejected", rejected, ss); | ||
|
||
// Hystrix's short circuit is not similar to Envoy's since it is triggered by 503 responses | ||
|
@@ -171,12 +191,8 @@ void HystrixSink::addHystrixCommand(ClusterStatsCache& cluster_stats_cache, | |
addIntToStream("rollingCountTimeout", timeouts, ss); | ||
addIntToStream("rollingCountBadRequests", 0, ss); | ||
addIntToStream("currentConcurrentExecutionCount", 0, ss); | ||
addIntToStream("latencyExecute_mean", 0, ss); | ||
|
||
// TODO trabetti : add histogram information once available by PR #2932 | ||
addInfoToStream( | ||
"latencyExecute", | ||
"{\"0\":0,\"25\":0,\"50\":0,\"75\":0,\"90\":0,\"95\":0,\"99\":0,\"99.5\":0,\"100\":0}", ss); | ||
addStringToStream("latencyExecute_mean", "null", ss); | ||
addHistogramToStream(histogram, "latencyExecute", ss); | ||
addIntToStream("propertyValue_circuitBreakerRequestVolumeThreshold", 0, ss); | ||
addIntToStream("propertyValue_circuitBreakerSleepWindowInMilliseconds", 0, ss); | ||
addIntToStream("propertyValue_circuitBreakerErrorThresholdPercentage", 0, ss); | ||
|
@@ -228,10 +244,10 @@ void HystrixSink::addClusterStatsToStream(ClusterStatsCache& cluster_stats_cache | |
uint64_t max_concurrent_requests, | ||
uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, | ||
std::stringstream& ss) { | ||
QuantileLatencyMap& histogram, std::stringstream& ss) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we make this |
||
|
||
addHystrixCommand(cluster_stats_cache, cluster_name, max_concurrent_requests, reporting_hosts, | ||
rolling_window_ms, ss); | ||
rolling_window_ms, histogram, ss); | ||
addHystrixThreadPool(cluster_name, max_concurrent_requests, reporting_hosts, rolling_window_ms, | ||
ss); | ||
} | ||
|
@@ -297,13 +313,34 @@ Http::Code HystrixSink::handlerHystrixEventStream(absl::string_view, | |
return Http::Code::OK; | ||
} | ||
|
||
void HystrixSink::flush(Stats::Source&) { | ||
void HystrixSink::flush(Stats::Source& source) { | ||
if (callbacks_list_.empty()) { | ||
return; | ||
} | ||
incCounter(); | ||
std::stringstream ss; | ||
Upstream::ClusterManager::ClusterInfoMap clusters = server_.clusterManager().clusters(); | ||
|
||
// Save a map of the relevant histograms per cluster in a convenient format. | ||
std::unordered_map<std::string, QuantileLatencyMap> time_histograms; | ||
for (const Stats::ParentHistogramSharedPtr histogram : source.cachedHistograms()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: should this be |
||
// histogram->name() on clusters of the format "cluster.cluster_name.histogram_name" | ||
// i.e. "cluster.service1.upstream_rq_time". | ||
const std::vector<std::string> split_name = absl::StrSplit(histogram->name(), '.'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can still be a vector of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. took your suggestion to use tags, removed this vector. |
||
if (split_name.size() >= 2 && split_name[0] == "cluster" && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just realized that you're essentially repeating work already done by Envoy. Rather than manually parsing the string, you could check the tag and tag_extracted_name since Envoy parses those for you. It may be a bit more code, but I think it's a bit safer since the regexes are a little more specific than splitting on
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if we should log if a cluster_name tag is not found. What could be the reason? Anything the user can do about it? If not, I think we can just verify it is not histogram.tags().end(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would imply that something is broken since we expect that this should never happen. I wouldn't say it's a catastrophic failure, so it's not a crashable event, but logging an error to the console might catch someone's attention so that they could file a bug against Envoy. I'm not super opinionated, so it's ultimately up to you. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO this is what ASSERT is for. It should stop tests from passing so you can investigate. I suspect a log would just slow things down and no one would notice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 for ASSERT There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
You are looking at split_name[2] so if size==2 that's a crasher. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed the code to use tags instead of this split, following @mrice32's suggestion |
||
split_name[2] == "upstream_rq_time") { | ||
QuantileLatencyMap& hist_map = time_histograms[split_name[1]]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we only expect to do this emplacement once per flush? What happens if you get multiple histograms that happen to satisfy your format checks and produce the same cluster name on the same flush (because of the dynamic nature of stats strings, things like this are always possible :) )? Won't the second overwrite the first? How do you want to handle that case? If you want to do something other than overwrite, this case can be checked pretty easily by using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there's no problem with overwriting. We want to show latency information in dashboard, it doesn't have to be exact data. If there is an extreme value (very high latency) it could be either a temporary situation that has already recovered, or a consistent problem the user wants to find and fix. In the case it was temporary, if we missed it because of the overwrite, it doesn't matter. If it is consistent, user will see it on next flush. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense. I think overwriting is fine. However, it seems like the code assumes that each cluster should only have one histogram that matches this filter (a pretty good assumption IMO). If there's another histogram that somehow gets inside the if statement, it might be worthwhile to log an error because it means that something is probably broken here. WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no problem, I can log in both cases, can you point me to a similar example? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. assert here too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep! |
||
const std::vector<double>& supported_quantiles = | ||
histogram->cumulativeStatistics().supportedQuantiles(); | ||
for (size_t i = 0; i < supported_quantiles.size(); ++i) { | ||
if (std::find(hystrix_quantiles.begin(), hystrix_quantiles.end(), supported_quantiles[i]) != | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is hystrix_quantiles sorted? If so, binary-search (via std::lower_bound and an equality-test) might be faster, but it's probably a small enough N that it isn't a big deal I guess. Still it's worth a comment that this is a deliberate choice. E.g.
|
||
hystrix_quantiles.end()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should there be handling for a case where the find() fails? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't it suppose to return hystrix_quantiles.end() if it fails? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of course, but is that expected? Should you handle that case? |
||
hist_map[supported_quantiles[i]] = histogram->intervalStatistics().computedQuantiles()[i]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
for (auto& cluster : clusters) { | ||
Upstream::ClusterInfoConstSharedPtr cluster_info = cluster.second.get().info(); | ||
|
||
|
@@ -321,7 +358,7 @@ void HystrixSink::flush(Stats::Source&) { | |
*cluster_stats_cache_ptr, cluster_info->name(), | ||
cluster_info->resourceManager(Upstream::ResourcePriority::Default).pendingRequests().max(), | ||
cluster_info->statsScope().gauge("membership_total").value(), server_.statsFlushInterval(), | ||
ss); | ||
time_histograms[cluster_info->name()], ss); | ||
} | ||
|
||
Buffer::OwnedImpl data; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,8 @@ namespace Hystrix { | |
typedef std::vector<uint64_t> RollingWindow; | ||
typedef std::map<const std::string, RollingWindow> RollingStatsMap; | ||
|
||
using QuantileLatencyMap = std::unordered_map<double, double>; | ||
|
||
struct { | ||
const std::string AllowHeadersHystrix{"Accept, Cache-Control, X-Requested-With, Last-Event-ID"}; | ||
} AccessControlAllowHeadersValue; | ||
|
@@ -26,6 +28,7 @@ struct ClusterStatsCache { | |
void printToStream(std::stringstream& out_str); | ||
void printRollingWindow(absl::string_view name, RollingWindow rolling_window, | ||
std::stringstream& out_str); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line? |
||
std::string cluster_name_; | ||
|
||
// Rolling windows | ||
|
@@ -72,7 +75,8 @@ class HystrixSink : public Stats::Sink, public Logger::Loggable<Logger::Id::hyst | |
void addClusterStatsToStream(ClusterStatsCache& cluster_stats_cache, | ||
absl::string_view cluster_name, uint64_t max_concurrent_requests, | ||
uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, std::stringstream& ss); | ||
std::chrono::milliseconds rolling_window_ms, | ||
QuantileLatencyMap& histogram, std::stringstream& ss); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we make this |
||
|
||
/** | ||
* Calculate values needed to create the stream and write into the map. | ||
|
@@ -94,33 +98,44 @@ class HystrixSink : public Stats::Sink, public Logger::Loggable<Logger::Id::hyst | |
*/ | ||
uint64_t getRollingValue(RollingWindow rolling_window); | ||
|
||
private: | ||
/** | ||
* Format the given key and absl::string_view value to "key"="value", and adding to the | ||
* Format the given key and value to "key"=value, and adding to the stringstream. | ||
*/ | ||
static void addInfoToStream(absl::string_view key, absl::string_view value, | ||
std::stringstream& info, bool is_first = false); | ||
|
||
/** | ||
* Format the given key and double value to "key"=<string of uint64_t>, and adding to the | ||
* stringstream. | ||
*/ | ||
void addStringToStream(absl::string_view key, absl::string_view value, std::stringstream& info, | ||
bool is_first = false); | ||
static void addDoubleToStream(absl::string_view key, double value, std::stringstream& info, | ||
bool is_first); | ||
|
||
/** | ||
* Format the given key and uint64_t value to "key"=<string of uint64_t>, and adding to the | ||
* Format the given key and absl::string_view value to "key"="value", and adding to the | ||
* stringstream. | ||
*/ | ||
void addIntToStream(absl::string_view key, uint64_t value, std::stringstream& info, | ||
bool is_first = false); | ||
static void addStringToStream(absl::string_view key, absl::string_view value, | ||
std::stringstream& info, bool is_first = false); | ||
|
||
/** | ||
* Format the given key and value to "key"=value, and adding to the stringstream. | ||
* Format the given key and uint64_t value to "key"=<string of uint64_t>, and adding to the | ||
* stringstream. | ||
*/ | ||
void addInfoToStream(absl::string_view key, absl::string_view value, std::stringstream& info, | ||
bool is_first = false); | ||
static void addIntToStream(absl::string_view key, uint64_t value, std::stringstream& info, | ||
bool is_first = false); | ||
|
||
static void addHistogramToStream(QuantileLatencyMap latency_map, absl::string_view key, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. const QuantileLatencyMap& |
||
std::stringstream& ss); | ||
|
||
private: | ||
/** | ||
* Generate HystrixCommand event stream. | ||
*/ | ||
void addHystrixCommand(ClusterStatsCache& cluster_stats_cache, absl::string_view cluster_name, | ||
uint64_t max_concurrent_requests, uint64_t reporting_hosts, | ||
std::chrono::milliseconds rolling_window_ms, std::stringstream& ss); | ||
std::chrono::milliseconds rolling_window_ms, QuantileLatencyMap& histogram, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: |
||
std::stringstream& ss); | ||
|
||
/** | ||
* Generate HystrixThreadPool event stream. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
const QuantileLatencyMap&