-
Notifications
You must be signed in to change notification settings - Fork 6.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix HostResolver behavior on fail #62652
Changes from 6 commits
08a2d19
87785e1
22f1c19
58c53fa
d8856b0
e9d2b54
8a24515
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,8 @@ | |
#include <Common/MemoryTrackerSwitcher.h> | ||
|
||
#include <mutex> | ||
#include <algorithm> | ||
|
||
|
||
namespace ProfileEvents | ||
{ | ||
|
@@ -19,6 +21,7 @@ namespace ProfileEvents | |
namespace CurrentMetrics | ||
{ | ||
extern const Metric AddressesActive; | ||
extern const Metric AddressesBanned; | ||
} | ||
|
||
namespace DB | ||
|
@@ -36,6 +39,7 @@ HostResolverMetrics HostResolver::getMetrics() | |
.expired = ProfileEvents::AddressesExpired, | ||
.failed = ProfileEvents::AddressesMarkedAsFailed, | ||
.active_count = CurrentMetrics::AddressesActive, | ||
.banned_count = CurrentMetrics::AddressesBanned, | ||
}; | ||
} | ||
|
||
|
@@ -47,7 +51,7 @@ HostResolver::WeakPtr HostResolver::getWeakFromThis() | |
HostResolver::HostResolver(String host_, Poco::Timespan history_) | ||
: host(std::move(host_)) | ||
, history(history_) | ||
, resolve_function([](const String & host_to_resolve) { return DNSResolver::instance().resolveHostAll(host_to_resolve); }) | ||
, resolve_function([](const String & host_to_resolve) { return DNSResolver::instance().resolveHostAllInOriginOrder(host_to_resolve); }) | ||
{ | ||
update(); | ||
} | ||
|
@@ -62,6 +66,12 @@ HostResolver::HostResolver( | |
HostResolver::~HostResolver() | ||
{ | ||
std::lock_guard lock(mutex); | ||
|
||
auto banned_count = 0; | ||
for (const auto & rec: records) | ||
banned_count += rec.failed; | ||
CurrentMetrics::sub(metrics.banned_count, banned_count); | ||
|
||
CurrentMetrics::sub(metrics.active_count, records.size()); | ||
records.clear(); | ||
} | ||
|
@@ -113,6 +123,7 @@ void HostResolver::updateWeights() | |
|
||
if (getTotalWeight() == 0 && !records.empty()) | ||
{ | ||
CurrentMetrics::sub(metrics.banned_count, records.size()); | ||
for (auto & rec : records) | ||
rec.failed = false; | ||
|
||
|
@@ -140,7 +151,7 @@ void HostResolver::setSuccess(const Poco::Net::IPAddress & address) | |
return; | ||
|
||
auto old_weight = it->getWeight(); | ||
++it->usage; | ||
it->setSuccess(); | ||
auto new_weight = it->getWeight(); | ||
|
||
if (old_weight != new_weight) | ||
|
@@ -158,8 +169,8 @@ void HostResolver::setFail(const Poco::Net::IPAddress & address) | |
if (it == records.end()) | ||
return; | ||
|
||
it->failed = true; | ||
it->fail_time = now; | ||
if (it->setFail(now)) | ||
CurrentMetrics::add(metrics.banned_count); | ||
} | ||
|
||
ProfileEvents::increment(metrics.failed); | ||
|
@@ -216,14 +227,20 @@ void HostResolver::updateImpl(Poco::Timestamp now, std::vector<Poco::Net::IPAddr | |
{ | ||
CurrentMetrics::sub(metrics.active_count, 1); | ||
ProfileEvents::increment(metrics.expired, 1); | ||
if (it_before->failed) | ||
CurrentMetrics::sub(metrics.banned_count); | ||
} | ||
++it_before; | ||
} | ||
else if (it_before == records.end() || (it_next != next_gen.end() && *it_next < it_before->address)) | ||
{ | ||
CurrentMetrics::add(metrics.active_count, 1); | ||
ProfileEvents::increment(metrics.discovered, 1); | ||
merged.push_back(Record(*it_next, now)); | ||
/// there could be duplicates in the next_gen vector | ||
if (merged.empty() || merged.back().address != *it_next) | ||
{ | ||
CurrentMetrics::add(metrics.active_count, 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we do not update metrics for duplicates. |
||
ProfileEvents::increment(metrics.discovered, 1); | ||
merged.push_back(Record(*it_next, now)); | ||
} | ||
++it_next; | ||
} | ||
else | ||
|
@@ -237,10 +254,22 @@ void HostResolver::updateImpl(Poco::Timestamp now, std::vector<Poco::Net::IPAddr | |
} | ||
|
||
for (auto & rec : merged) | ||
if (rec.failed && rec.fail_time < last_effective_resolve) | ||
rec.failed = false; | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I adjust new counter |
||
if (!rec.failed) | ||
continue; | ||
|
||
/// Exponential increased time for each consecutive fail | ||
auto banned_until = now - Poco::Timespan(history.totalMicroseconds() * (1ull << (rec.consecutive_fail_count - 1))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I use |
||
if (rec.fail_time < banned_until) | ||
{ | ||
rec.failed = false; | ||
CurrentMetrics::sub(metrics.banned_count); | ||
} | ||
} | ||
|
||
chassert(std::is_sorted(merged.begin(), merged.end())); | ||
// check that merged contains unique elements | ||
chassert(std::adjacent_find(merged.begin(), merged.end()) == merged.end()); | ||
|
||
last_resolve_time = now; | ||
records.swap(merged); | ||
|
@@ -251,6 +280,7 @@ void HostResolver::updateImpl(Poco::Timestamp now, std::vector<Poco::Net::IPAddr | |
updateWeights(); | ||
} | ||
|
||
|
||
size_t HostResolver::getTotalWeight() const | ||
{ | ||
if (records.empty()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,9 +39,11 @@ struct HostResolverMetrics | |
const ProfileEvents::Event failed = ProfileEvents::end(); | ||
|
||
const CurrentMetrics::Metric active_count = CurrentMetrics::end(); | ||
const CurrentMetrics::Metric banned_count = CurrentMetrics::end(); | ||
}; | ||
|
||
constexpr size_t DEFAULT_RESOLVE_TIME_HISTORY_SECONDS = 2*60; | ||
constexpr size_t RECORD_CONSECTIVE_FAIL_COUNT_LIMIT = 6; | ||
|
||
|
||
class HostResolver : public std::enable_shared_from_this<HostResolver> | ||
|
@@ -141,6 +143,7 @@ class HostResolver : public std::enable_shared_from_this<HostResolver> | |
size_t usage = 0; | ||
bool failed = false; | ||
Poco::Timestamp fail_time = 0; | ||
size_t consecutive_fail_count = 0; | ||
|
||
size_t weight_prefix_sum; | ||
|
||
|
@@ -149,6 +152,11 @@ class HostResolver : public std::enable_shared_from_this<HostResolver> | |
return address < r.address; | ||
} | ||
|
||
bool operator ==(const Record & r) const | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needed for the uniqueness check under `chassert`. |
||
{ | ||
return address == r.address; | ||
} | ||
|
||
size_t getWeight() const | ||
{ | ||
if (failed) | ||
|
@@ -166,6 +174,28 @@ class HostResolver : public std::enable_shared_from_this<HostResolver> | |
return 8; | ||
return 10; | ||
} | ||
|
||
bool setFail(const Poco::Timestamp & now) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Returns true if the status has changed. Needed for adjusting metrics. |
||
{ | ||
bool was_ok = !failed; | ||
|
||
failed = true; | ||
fail_time = now; | ||
|
||
if (was_ok) | ||
{ | ||
if (consecutive_fail_count < RECORD_CONSECTIVE_FAIL_COUNT_LIMIT) | ||
++consecutive_fail_count; | ||
} | ||
|
||
return was_ok; | ||
} | ||
|
||
void setSuccess() | ||
{ | ||
consecutive_fail_count = 0; | ||
++usage; | ||
} | ||
}; | ||
|
||
using Records = std::vector<Record>; | ||
|
@@ -178,6 +208,7 @@ class HostResolver : public std::enable_shared_from_this<HostResolver> | |
void updateWeights() TSA_REQUIRES(mutex); | ||
void updateWeightsImpl() TSA_REQUIRES(mutex); | ||
size_t getTotalWeight() const TSA_REQUIRES(mutex); | ||
Poco::Timespan getRecordHistoryTime(const Record&) const; | ||
|
||
const String host; | ||
const Poco::Timespan history; | ||
|
@@ -188,7 +219,7 @@ class HostResolver : public std::enable_shared_from_this<HostResolver> | |
|
||
std::mutex mutex; | ||
|
||
Poco::Timestamp last_resolve_time TSA_GUARDED_BY(mutex); | ||
Poco::Timestamp last_resolve_time TSA_GUARDED_BY(mutex) = Poco::Timestamp::TIMEVAL_MIN; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just to be sure that HostResolver::update is called in c-tor even if history is |
||
Records records TSA_GUARDED_BY(mutex); | ||
|
||
Poco::Logger * log = &Poco::Logger::get("ConnectionPool"); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some of the records may be alive here; we need to decrease the counter only for records with failed==true, I guess.
Like
And technically the metric will jump from some value to zero and back while the IP is still unavailable.
So maybe it would be cleaner to return bool
was_banned
from the setSuccess method and decrement the counter when it changes there?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are here only when we need to reset failed flag for all records.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updateWeights
is called in updateImpl
too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I do not understand what you are pointing at.
update
fetches new hosts from the resolver cache and calls updateImpl
under a lock. updateImpl
applies the new set of IPs to the inner state and calls updateWeights
. updateWeights
first calls updateWeightsImpl
. updateWeightsImpl
calculates new weights for random choices. After that
updateWeights
checks whether we succeeded in that weight calculation. If all hosts are banned as faulty, then we reset the faulty flags for all the records. It is our only choice here. And after that updateWeights
calls updateWeightsImpl
one more time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updateWeights
is called in two cases -- 1. We do update
. 2. Some records changed their weight.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I missed that getTotalWeight() == 0 when this code is called.
Sorry, my mistake.