From 62f9012fcdc17acbc7c0f6a462890ecdcec9f724 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 10 Aug 2017 21:00:13 +0000 Subject: [PATCH 1/4] test/spellcheck: fix TAP output when test is skipped When the spellcheck tests are skipped (due to missing aspell for example), the skip TAP output is incorrect, or at least is deemed incorrect by the tap-driver.sh automake TAP driver, which causes the test to fail. Change the output to `1..0 # skip` to make the test driver happy. --- doc/test/spellcheck | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/test/spellcheck b/doc/test/spellcheck index b59fc773544a..3d6d34fb4192 100755 --- a/doc/test/spellcheck +++ b/doc/test/spellcheck @@ -19,7 +19,7 @@ if egrep -q '^[[:space:]]*$' $dict; then exit 1 fi if test -z "$ASPELL"; then - echo "1..$# # skip because aspell is not installed" + echo "1..0 # skip because aspell is not installed" exit 0 fi if ! $ASPELL -n list /dev/null; then From 352c2dff74d3abc71305e4b57657ae44d96c2306 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 10 Aug 2017 18:21:57 +0000 Subject: [PATCH 2/4] connector/local: rename connect retry count Rename local connector retry count from FLUX_RETRY_COUNT to FLUX_LOCAL_CONNECTOR_RETRY_COUNT ass suggested in issue #677. --- src/connectors/local/local.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectors/local/local.c b/src/connectors/local/local.c index a4f47d24b503..e723550fc6e2 100644 --- a/src/connectors/local/local.c +++ b/src/connectors/local/local.c @@ -261,7 +261,7 @@ flux_t *connector_init (const char *path, int flags) goto error; c->fd_nonblock = -1; for (count=0;;count++) { - if (count >= env_getint("FLUX_RETRY_COUNT", 5)) + if (count >= env_getint("FLUX_LOCAL_CONNECTOR_RETRY_COUNT", 5)) goto error; memset (&addr, 0, sizeof (struct sockaddr_un)); addr.sun_family = AF_UNIX; From 21b2dd004a6ff1093f6ed5ea9542f6b47d7ccaae Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 10 Aug 2017 20:57:41 +0000 Subject: [PATCH 3/4] connector/local: add exponential backoff during connect Introduce an exponential backoff to the local connector's attempts to connect to the flux instance api domain socket. The backoff starts at 16ms and doubles up to 2s. This is so that the default retry count of 5 still times out after about 500ms as in the current code. --- src/connectors/local/local.c | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/connectors/local/local.c b/src/connectors/local/local.c index e723550fc6e2..404b2dabc865 100644 --- a/src/connectors/local/local.c +++ b/src/connectors/local/local.c @@ -228,14 +228,39 @@ static int env_getint (char *name, int dflt) return s ? strtol (s, NULL, 10) : dflt; } +/* Connect socket `fd` to unix domain socket `file` and fail after `retries` + * attempts with exponential retry backoff starting at 16ms. + * Return 0 on success, or -1 on failure. + */ +static int connect_sock_with_retry (int fd, const char *file, int retries) +{ + int count = 0; + struct sockaddr_un addr; + useconds_t s = 8 * 1000; + int maxdelay = 2000000; + do { + memset (&addr, 0, sizeof (struct sockaddr_un)); + addr.sun_family = AF_UNIX; + if (strncpy (addr.sun_path, file, sizeof (addr.sun_path) - 1) < 0) { + errno = EINVAL; + return -1; + } + if (connect (fd, (struct sockaddr *)&addr, sizeof (addr)) == 0) + return 0; + if (s < maxdelay) + s = 2*s < maxdelay ? 2*s : maxdelay; + } while ((++count <= retries) && (usleep (s) == 0)); + return -1; +} + /* Path is interpreted as the directory containing the unix domain socket. */ flux_t *connector_init (const char *path, int flags) { local_ctx_t *c = NULL; - struct sockaddr_un addr; char sockfile[PATH_MAX + 1]; - int n, count; + int n; + int retries = env_getint ("FLUX_LOCAL_CONNECTOR_RETRY_COUNT", 5); if (!path) { errno = EINVAL; @@ -260,17 +285,8 @@ flux_t *connector_init (const char *path, int flags) if (c->fd < 0) goto error; c->fd_nonblock = -1; - for (count=0;;count++) { - if (count >= env_getint("FLUX_LOCAL_CONNECTOR_RETRY_COUNT", 5)) - goto error; - memset (&addr, 0, sizeof (struct sockaddr_un)); - addr.sun_family = AF_UNIX; - strncpy (addr.sun_path, sockfile, sizeof (addr.sun_path) - 1); - if (connect (c->fd, (struct sockaddr *)&addr, - sizeof (struct sockaddr_un)) == 0) - break; - usleep (100*1000); - } + if (connect_sock_with_retry (c->fd, sockfile, retries) < 0) + goto error; /* read 1 byte indicating success or failure of auth */ unsigned char e; int rc; From 56b29b87b77d36bf572f0147dc7a24923ee1ef83 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Thu, 10 Aug 2017 18:26:26 +0000 Subject: [PATCH 4/4] t5000-valgrind.t: allow more connect retries in flux_open Allow more retries in flux_open in local connector as suggested in #677. --- t/t5000-valgrind.t | 3 +++ 1 file changed, 3 insertions(+) diff --git a/t/t5000-valgrind.t b/t/t5000-valgrind.t index 3c8c1de2c691..68f03b05ab46 100755 --- a/t/t5000-valgrind.t +++ b/t/t5000-valgrind.t @@ -30,6 +30,9 @@ VALGRIND_SUPPRESSIONS=${SHARNESS_TEST_SRCDIR}/valgrind/valgrind.supp VALGRIND_WORKLOAD=${SHARNESS_TEST_SRCDIR}/valgrind/valgrind-workload.sh BROKER=${FLUX_BUILD_DIR}/src/broker/.libs/lt-flux-broker +# broker run under valgrind may need extra retries in flux_open(): +FLUX_LOCAL_CONNECTOR_RETRY_COUNT=10 + if ! test -x $BROKER; then ${FLUX_BUILD_DIR}/src/broker/flux-broker --help >/dev/null 2>&1 fi