Skip to content

Commit

Permalink
apple dns: retry connection establishment (#13942)
Browse files Browse the repository at this point in the history
Commit Message: apple dns - retry connection establishment
Risk Level: med
Testing: added new tests.

Signed-off-by: Jose Nino <[email protected]>
  • Loading branch information
junr03 authored Nov 11, 2020
1 parent d342e96 commit a55ae7e
Show file tree
Hide file tree
Showing 12 changed files with 260 additions and 32 deletions.
2 changes: 2 additions & 0 deletions docs/root/intro/arch_overview/listeners/dns_filter.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.. _arch_overview_dns_filter:

DNS Filter
==========

Expand Down
24 changes: 24 additions & 0 deletions docs/root/intro/arch_overview/upstream/dns_resolution.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
.. _arch_overview_dns_resolution:

DNS Resolution
==============

Many Envoy components resolve DNS: different cluster types (
:ref:`strict dns <arch_overview_service_discovery_types_strict_dns>`,
:ref:`logical dns <arch_overview_service_discovery_types_logical_dns>`);
the :ref:`dynamic forward proxy <arch_overview_http_dynamic_forward_proxy>` system (which is
composed of a cluster and a filter);
the udp :ref:`dns filter <arch_overview_dns_filter>`, etc.
Envoy uses `c-ares <https://github.com/c-ares/c-ares>`_ as a third party DNS resolution library.
On Apple OSes Envoy additionally offers resolution using Apple specific APIs via the
``envoy.restart_features.use_apple_api_for_dns_lookups`` runtime feature.

The Apple-based DNS Resolver emits the following stats rooted in the ``dns.apple`` stats tree:

.. csv-table::
:header: Name, Type, Description
:widths: 1, 1, 2

connection_failure, Counter, Number of failed attempts to connect to the DNS server
socket_failure, Counter, Number of failed attempts to obtain a file descriptor to the socket to the DNS server
processing_failure, Counter, Number of failures when processing data from the DNS server
1 change: 1 addition & 0 deletions docs/root/intro/arch_overview/upstream/upstream.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Upstream clusters

cluster_manager
service_discovery
dns_resolution
health_checking
connection_pooling
load_balancing/load_balancing
Expand Down
4 changes: 2 additions & 2 deletions include/envoy/api/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ class Api {
virtual TimeSource& timeSource() PURE;

/**
* @return a constant reference to the root Stats::Scope
* @return a reference to the root Stats::Scope
*/
virtual const Stats::Scope& rootScope() PURE;
virtual Stats::Scope& rootScope() PURE;

/**
* @return a reference to the RandomGenerator.
Expand Down
2 changes: 1 addition & 1 deletion source/common/api/api_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class Impl : public Api {
Thread::ThreadFactory& threadFactory() override { return thread_factory_; }
Filesystem::Instance& fileSystem() override { return file_system_; }
TimeSource& timeSource() override { return time_system_; }
const Stats::Scope& rootScope() override { return store_; }
Stats::Scope& rootScope() override { return store_; }
Random::RandomGenerator& randomGenerator() override { return random_generator_; }
ProcessContextOptRef processContext() override { return process_context_; }

Expand Down
6 changes: 3 additions & 3 deletions source/common/event/dispatcher_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,11 @@ Network::DnsResolverSharedPtr DispatcherImpl::createDnsResolver(
"using TCP for DNS lookups is not possible when using Apple APIs for DNS "
"resolution. Apple' API only uses UDP for DNS resolution. Use UDP or disable "
"the envoy.restart_features.use_apple_api_for_dns_lookups runtime feature.");
return Network::DnsResolverSharedPtr{new Network::AppleDnsResolverImpl(*this)};
return std::make_shared<Network::AppleDnsResolverImpl>(*this, api_.randomGenerator(),
api_.rootScope());
}
#endif
return Network::DnsResolverSharedPtr{
new Network::DnsResolverImpl(*this, resolvers, use_tcp_for_dns_lookups)};
return std::make_shared<Network::DnsResolverImpl>(*this, resolvers, use_tcp_for_dns_lookups);
}

FileEventPtr DispatcherImpl::createFileEvent(os_fd_t fd, FileReadyCb cb, FileTriggerType trigger,
Expand Down
2 changes: 2 additions & 0 deletions source/common/network/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,10 @@ envoy_cc_library(
":utility_lib",
"//include/envoy/event:dispatcher_interface",
"//include/envoy/event:file_event_interface",
"//include/envoy/event:timer_interface",
"//include/envoy/network:dns_interface",
"//source/common/common:assert_lib",
"//source/common/common:backoff_lib",
"//source/common/common:linked_object",
"//source/common/singleton:threadsafe_singleton",
],
Expand Down
75 changes: 65 additions & 10 deletions source/common/network/apple_dns_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,26 @@ DNSServiceErrorType DnsService::dnsServiceGetAddrInfo(DNSServiceRef* sdRef, DNSS
return DNSServiceGetAddrInfo(sdRef, flags, interfaceIndex, protocol, hostname, callBack, context);
}

AppleDnsResolverImpl::AppleDnsResolverImpl(Event::Dispatcher& dispatcher)
: dispatcher_(dispatcher) {
// Parameters of the jittered backoff strategy.
static constexpr std::chrono::milliseconds RetryInitialDelayMilliseconds(30);
static constexpr std::chrono::milliseconds RetryMaxDelayMilliseconds(30000);

AppleDnsResolverImpl::AppleDnsResolverImpl(Event::Dispatcher& dispatcher,
Random::RandomGenerator& random,
Stats::Scope& root_scope)
: dispatcher_(dispatcher), initialize_failure_timer_(dispatcher.createTimer(
[this]() -> void { initializeMainSdRef(); })),
backoff_strategy_(std::make_unique<JitteredExponentialBackOffStrategy>(
RetryInitialDelayMilliseconds.count(), RetryMaxDelayMilliseconds.count(), random)),
scope_(root_scope.createScope("dns.apple.")), stats_(generateAppleDnsResolverStats(*scope_)) {
ENVOY_LOG(debug, "Constructing DNS resolver");
initializeMainSdRef();
}

AppleDnsResolverImpl::~AppleDnsResolverImpl() {
ENVOY_LOG(debug, "Destructing DNS resolver");
deallocateMainSdRef();
AppleDnsResolverImpl::~AppleDnsResolverImpl() { deallocateMainSdRef(); }

AppleDnsResolverStats AppleDnsResolverImpl::generateAppleDnsResolverStats(Stats::Scope& scope) {
return {ALL_APPLE_DNS_RESOLVER_STATS(POOL_COUNTER(scope))};
}

void AppleDnsResolverImpl::deallocateMainSdRef() {
Expand All @@ -78,13 +89,28 @@ void AppleDnsResolverImpl::initializeMainSdRef() {
// However, using a shared connection brings some complexities detailed in the inline comments
// for kDNSServiceFlagsShareConnection in dns_sd.h, and copied (and edited) in this implementation
// where relevant.
//
// When error occurs while the main_sd_ref_ is initialized, the initialize_failure_timer_ will be
// enabled to retry initialization. Retries can also be triggered via query submission, @see
// AppleDnsResolverImpl::resolve(...) for details.
auto error = DnsServiceSingleton::get().dnsServiceCreateConnection(&main_sd_ref_);
RELEASE_ASSERT(error == kDNSServiceErr_NoError,
fmt::format("error={} in DNSServiceCreateConnection", error));
if (error != kDNSServiceErr_NoError) {
stats_.connection_failure_.inc();
initialize_failure_timer_->enableTimer(
std::chrono::milliseconds(backoff_strategy_->nextBackOffMs()));
return;
}

auto fd = DnsServiceSingleton::get().dnsServiceRefSockFD(main_sd_ref_);
RELEASE_ASSERT(fd != -1, "error in DNSServiceRefSockFD");
ENVOY_LOG(debug, "DNS resolver has fd={}", fd);
// According to dns_sd.h: DnsServiceRefSockFD returns "The DNSServiceRef's underlying socket
// descriptor, or -1 on error.". Although it gives no detailed description on when/why this call
// would fail.
if (fd == -1) {
stats_.socket_failure_.inc();
initialize_failure_timer_->enableTimer(
std::chrono::milliseconds(backoff_strategy_->nextBackOffMs()));
return;
}

sd_ref_event_ = dispatcher_.createFileEvent(
fd,
Expand All @@ -93,6 +119,12 @@ void AppleDnsResolverImpl::initializeMainSdRef() {
[this](uint32_t events) { onEventCallback(events); }, Event::FileTriggerType::Level,
Event::FileReadyType::Read);
sd_ref_event_->setEnabled(Event::FileReadyType::Read);

// Disable the failure timer and reset the backoff strategy because the main_sd_ref_ was
// successfully initialized. Note that these actions will be no-ops if the timer was not armed to
// begin with.
initialize_failure_timer_->disableTimer();
backoff_strategy_->reset();
}

void AppleDnsResolverImpl::onEventCallback(uint32_t events) {
Expand All @@ -102,6 +134,7 @@ void AppleDnsResolverImpl::onEventCallback(uint32_t events) {
DNSServiceErrorType error = DnsServiceSingleton::get().dnsServiceProcessResult(main_sd_ref_);
if (error != kDNSServiceErr_NoError) {
ENVOY_LOG(warn, "DNS resolver error ({}) in DNSServiceProcessResult", error);
stats_.processing_failure_.inc();
// Similar to receiving an error in onDNSServiceGetAddrInfoReply, an error while processing fd
// events indicates that the sd_ref state is broken.
// Therefore, flush queries with_error == true.
Expand All @@ -125,7 +158,28 @@ ActiveDnsQuery* AppleDnsResolverImpl::resolve(const std::string& dns_name,
dns_name, address->asString());
} catch (const EnvoyException& e) {
// Resolution via Apple APIs
ENVOY_LOG(debug, "DNS resolver local resolution failed with: {}", e.what());
ENVOY_LOG(trace, "DNS resolver local resolution failed with: {}", e.what());

// First check that the main_sd_ref is alive by checking if the resolver is currently trying to
// initialize its main_sd_ref.
if (initialize_failure_timer_->enabled()) {
// No queries should be accumulating while the main_sd_ref_ is not alive. Either they were
// flushed when the error that deallocated occurred, or they have all failed in this branch of
// the code synchronously due to continuous inability to initialize the main_sd_ref_.
ASSERT(queries_with_pending_cb_.empty());

// Short-circuit the pending retry to initialize the main_sd_ref_ and try now.
initializeMainSdRef();

// If the timer is still enabled, that means the initialization failed. Synchronously fail the
// resolution, the callback target should retry.
if (initialize_failure_timer_->enabled()) {
callback(DnsResolver::ResolutionStatus::Failure, {});
return nullptr;
}
}

// Proceed with resolution after establishing that the resolver has a live main_sd_ref_.
std::unique_ptr<PendingResolution> pending_resolution(
new PendingResolution(*this, callback, dispatcher_, main_sd_ref_, dns_name));

Expand Down Expand Up @@ -204,6 +258,7 @@ AppleDnsResolverImpl::PendingResolution::~PendingResolution() {
// thus the DNSServiceRef is null.
// Therefore, only deallocate if the ref is not null.
if (individual_sd_ref_) {
ENVOY_LOG(debug, "DNSServiceRefDeallocate individual sd ref");
DnsServiceSingleton::get().dnsServiceRefDeallocate(individual_sd_ref_);
}
}
Expand Down
27 changes: 25 additions & 2 deletions source/common/network/apple_dns_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
#include "envoy/common/platform.h"
#include "envoy/event/dispatcher.h"
#include "envoy/event/file_event.h"
#include "envoy/event/timer.h"
#include "envoy/network/dns.h"

#include "common/common/backoff_strategy.h"
#include "common/common/linked_object.h"
#include "common/common/logger.h"
#include "common/common/utility.h"
Expand Down Expand Up @@ -37,14 +39,31 @@ class DnsService {

using DnsServiceSingleton = ThreadSafeSingleton<DnsService>;

/**
* All DNS resolver stats. @see stats_macros.h
*/
#define ALL_APPLE_DNS_RESOLVER_STATS(COUNTER) \
COUNTER(connection_failure) \
COUNTER(socket_failure) \
COUNTER(processing_failure)

/**
* Struct definition for all DNS resolver stats. @see stats_macros.h
*/
struct AppleDnsResolverStats {
ALL_APPLE_DNS_RESOLVER_STATS(GENERATE_COUNTER_STRUCT)
};

/**
* Implementation of DnsResolver that uses Apple dns_sd.h APIs. All calls and callbacks are assumed
* to happen on the thread that owns the creating dispatcher.
*/
class AppleDnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logger::Id::upstream> {
public:
AppleDnsResolverImpl(Event::Dispatcher& dispatcher);
AppleDnsResolverImpl(Event::Dispatcher& dispatcher, Random::RandomGenerator& random,
Stats::Scope& root_scope);
~AppleDnsResolverImpl() override;
static AppleDnsResolverStats generateAppleDnsResolverStats(Stats::Scope& scope);

// Network::DnsResolver
ActiveDnsQuery* resolve(const std::string& dns_name, DnsLookupFamily dns_lookup_family,
Expand Down Expand Up @@ -111,8 +130,12 @@ class AppleDnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logg
void flushPendingQueries(const bool with_error);

Event::Dispatcher& dispatcher_;
DNSServiceRef main_sd_ref_;
Event::TimerPtr initialize_failure_timer_;
BackOffStrategyPtr backoff_strategy_;
DNSServiceRef main_sd_ref_{nullptr};
Event::FileEventPtr sd_ref_event_;
Stats::ScopePtr scope_;
AppleDnsResolverStats stats_;
// When using a shared sd ref via DNSServiceCreateConnection, the DNSServiceGetAddrInfoReply
// callback with the kDNSServiceFlagsMoreComing flag might refer to addresses for various
// PendingResolutions. Therefore, the resolver needs to have a container of queries pending
Expand Down
2 changes: 2 additions & 0 deletions test/common/network/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,10 @@ envoy_cc_test(
"//include/envoy/network:dns_interface",
"//source/common/event:dispatcher_includes",
"//include/envoy/event:file_event_interface",
"//source/common/stats:isolated_store_lib",
"//source/common/event:dispatcher_lib",
"//source/common/network:address_lib",
"//source/common/common:random_generator_lib",
"//test/test_common:environment_lib",
"//test/test_common:network_utility_lib",
"//test/test_common:utility_lib",
Expand Down
Loading

0 comments on commit a55ae7e

Please sign in to comment.