diff --git a/doc/test/spellcheck b/doc/test/spellcheck index b59fc773544a..3d6d34fb4192 100755 --- a/doc/test/spellcheck +++ b/doc/test/spellcheck @@ -19,7 +19,7 @@ if egrep -q '^[[:space:]]*$' $dict; then exit 1 fi if test -z "$ASPELL"; then - echo "1..$# # skip because aspell is not installed" + echo "1..0 # skip because aspell is not installed" exit 0 fi if ! $ASPELL -n list /dev/null; then diff --git a/src/connectors/local/local.c b/src/connectors/local/local.c index a4f47d24b503..404b2dabc865 100644 --- a/src/connectors/local/local.c +++ b/src/connectors/local/local.c @@ -228,14 +228,39 @@ static int env_getint (char *name, int dflt) return s ? strtol (s, NULL, 10) : dflt; } +/* Connect socket `fd` to unix domain socket `file` and fail after `retries` + * attempts with exponential retry backoff starting at 16ms. + * Return 0 on success, or -1 on failure. + */ +static int connect_sock_with_retry (int fd, const char *file, int retries) +{ + int count = 0; + struct sockaddr_un addr; + useconds_t s = 8 * 1000; + int maxdelay = 2000000; + do { + memset (&addr, 0, sizeof (struct sockaddr_un)); + addr.sun_family = AF_UNIX; + if (strncpy (addr.sun_path, file, sizeof (addr.sun_path) - 1) < 0) { + errno = EINVAL; + return -1; + } + if (connect (fd, (struct sockaddr *)&addr, sizeof (addr)) == 0) + return 0; + if (s < maxdelay) + s = 2*s < maxdelay ? 2*s : maxdelay; + } while ((++count <= retries) && (usleep (s) == 0)); + return -1; +} + /* Path is interpreted as the directory containing the unix domain socket. */ flux_t *connector_init (const char *path, int flags) { local_ctx_t *c = NULL; - struct sockaddr_un addr; char sockfile[PATH_MAX + 1]; - int n, count; + int n; + int retries = env_getint ("FLUX_LOCAL_CONNECTOR_RETRY_COUNT", 5); if (!path) { errno = EINVAL; @@ -260,17 +285,8 @@ flux_t *connector_init (const char *path, int flags) if (c->fd < 0) goto error; c->fd_nonblock = -1; - for (count=0;;count++) { - if (count >= env_getint("FLUX_RETRY_COUNT", 5)) - goto error; - memset (&addr, 0, sizeof (struct sockaddr_un)); - addr.sun_family = AF_UNIX; - strncpy (addr.sun_path, sockfile, sizeof (addr.sun_path) - 1); - if (connect (c->fd, (struct sockaddr *)&addr, - sizeof (struct sockaddr_un)) == 0) - break; - usleep (100*1000); - } + if (connect_sock_with_retry (c->fd, sockfile, retries) < 0) + goto error; /* read 1 byte indicating success or failure of auth */ unsigned char e; int rc; diff --git a/t/t5000-valgrind.t b/t/t5000-valgrind.t index 3c8c1de2c691..68f03b05ab46 100755 --- a/t/t5000-valgrind.t +++ b/t/t5000-valgrind.t @@ -30,6 +30,9 @@ VALGRIND_SUPPRESSIONS=${SHARNESS_TEST_SRCDIR}/valgrind/valgrind.supp VALGRIND_WORKLOAD=${SHARNESS_TEST_SRCDIR}/valgrind/valgrind-workload.sh BROKER=${FLUX_BUILD_DIR}/src/broker/.libs/lt-flux-broker +# broker run under valgrind may need extra retries in flux_open(): +FLUX_LOCAL_CONNECTOR_RETRY_COUNT=10 + if ! test -x $BROKER; then ${FLUX_BUILD_DIR}/src/broker/flux-broker --help >/dev/null 2>&1 fi