Skip to content

Commit

Permalink
ZOOKEEPER-3723: Zookeeper Client should not fail with ZSYSTEMERROR if…
Browse files Browse the repository at this point in the history
… DNS does not resolve one of the servers in the zk ensemble

The change is backported from
Jeelani Mohamed Abdul Khader <mjeelanidevvm3360.prn2.facebook.com>'s
#579 change done in
"ZOOKEEPER-3095: Connect string fix for non-existent hosts"

Code changes to not fail the session initiation if the DNS is not able to
resolve the hostname of one of the servers in the Zookeeper ensemble.

Some Background:
The Zookeeper client resolves all the hostnames in the ensemble while establishing the session.
In Kubernetes environment with coreDNS, the hostname entry gets removed from coreDNS during the POD restarts.
Though we can manipulate the coreDNS settings to delay the removal of the hostname entry from DNS,
we don't want to leave any race where Zookeeper client is trying to establish a session and it fails
because the DNS temporarily is not able to resolve the hostname. So as long as one of the servers
in the ensemble is able to be DNS resolvable, should we not fail the session establishment with hard error
and instead try to establish the connection with one of the other servers

Author: Suhas Dantkale <[email protected]>

Reviewers: Enrico Olivelli <[email protected]>, Mate Szalay-Beko <[email protected]>, Damien Diederen <[email protected]>

Closes #1259 from suhasdantkale/suhas/ZOOKEEPER-3723
  • Loading branch information
suhasdantkale authored and eolivelli committed May 4, 2020
1 parent 3b5be59 commit d336e5a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 7 deletions.
18 changes: 11 additions & 7 deletions zookeeper-client/zookeeper-client-c/src/zookeeper.c
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ static int resolve_hosts(const zhandle_t *zh, const char *hosts_in, addrvec_t *a
LOG_ERROR(LOGCALLBACK(zh), "could not resolve %s", host);
errno=ENOENT;
rc=ZBADARGUMENTS;
goto fail;
goto next_host;
}

// Setup the address array
Expand Down Expand Up @@ -786,6 +786,7 @@ static int resolve_hosts(const zhandle_t *zh, const char *hosts_in, addrvec_t *a
addr->ss_family, hosts_in);
}
}
next_host:
host = strtok_r(0, ",", &strtok_last);
}
#else
Expand Down Expand Up @@ -826,14 +827,13 @@ static int resolve_hosts(const zhandle_t *zh, const char *hosts_in, addrvec_t *a
if (rc != 0) {
errno = getaddrinfo_errno(rc);
#ifdef _WIN32
LOG_ERROR(LOGCALLBACK(zh), "Win32 message: %s\n", gai_strerror(rc));
LOG_ERROR(LOGCALLBACK(zh), "%s: Win32 message: %s\n", host, gai_strerror(rc));
#elif __linux__ && __GNUC__
LOG_ERROR(LOGCALLBACK(zh), "getaddrinfo: %s\n", gai_strerror(rc));
LOG_ERROR(LOGCALLBACK(zh), "%s: getaddrinfo(): %s\n", host, gai_strerror(rc));
#else
LOG_ERROR(LOGCALLBACK(zh), "getaddrinfo: %s\n", strerror(errno));
LOG_ERROR(LOGCALLBACK(zh), "%s: getaddrinfo(): %s\n", host, strerror(errno));
#endif
rc=ZSYSTEMERROR;
goto fail;
goto next_host;
}
}

Expand Down Expand Up @@ -865,11 +865,15 @@ static int resolve_hosts(const zhandle_t *zh, const char *hosts_in, addrvec_t *a
}

freeaddrinfo(res0);

next_host:
host = strtok_r(0, ",", &strtok_last);
}
#endif
}
if (avec->count == 0) {
rc = ZSYSTEMERROR; // not a single host resolved
goto fail;
}
free(hosts);

if(!disable_conn_permute){
Expand Down
11 changes: 11 additions & 0 deletions zookeeper-client/zookeeper-client-c/tests/TestClient.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ class Zookeeper_simpleSystem : public CPPUNIT_NS::TestFixture
CPPUNIT_TEST(testAsyncWatcherAutoReset);
CPPUNIT_TEST(testDeserializeString);
CPPUNIT_TEST(testFirstServerDown);
CPPUNIT_TEST(testNonexistentHost);
#ifdef THREADED
CPPUNIT_TEST(testNullData);
#ifdef ZOO_IPV6_ENABLED
Expand Down Expand Up @@ -327,6 +328,16 @@ class Zookeeper_simpleSystem : public CPPUNIT_NS::TestFixture
CPPUNIT_ASSERT(zk != 0);
CPPUNIT_ASSERT(ctx.waitForConnected(zk));
}

/** Checks that a non-existent host will not block the connection **/
void testNonexistentHost() {
char hosts[] = "jimmy:5555,127.0.0.1:22181,someinvalid.host.com:1234";
watchctx_t ctx;
zoo_deterministic_conn_order(true /* disable permute */);
zhandle_t *zh = createClient(hosts, &ctx);
CPPUNIT_ASSERT(ctx.waitForConnected(zh));
zoo_deterministic_conn_order(false /* enable permute */);
}

/** this checks for a deadlock in calling zookeeper_close and calls from a default watcher that might get triggered just when zookeeper_close() is in progress **/
void testHangingClient() {
Expand Down

0 comments on commit d336e5a

Please sign in to comment.