diff --git a/src/flb_network.c b/src/flb_network.c index 31255a413f6..69eb73ef58a 100644 --- a/src/flb_network.c +++ b/src/flb_network.c @@ -446,14 +446,14 @@ static int net_connect_async(int fd, /* Connection is broken, not much to do here */ str = strerror_r(error, so_error_buf, sizeof(so_error_buf)); - flb_error("[net] TCP connection failed: %s:%i (%s)", - u->tcp_host, u->tcp_port, str); + flb_error("[net] TCP connection #%i failed: %s:%i (%s)", + u_conn->fd, u->tcp_host, u->tcp_port, str); return -1; } } else { - flb_error("[net] TCP connection, unexpected error: %s:%i", - u->tcp_host, u->tcp_port); + flb_error("[net] TCP connection #%i, unexpected error: %s:%i", + u_conn->fd, u->tcp_host, u->tcp_port); return -1; } @@ -1061,6 +1061,8 @@ flb_sockfd_t flb_net_tcp_connect(const char *host, unsigned long port, char _port[6]; struct addrinfo hints; struct addrinfo *res, *rp; + char so_error_buf[256] = {0}; + char *str = NULL; if (is_async == FLB_TRUE && !u_conn) { flb_error("[net] invalid async mode with not set upstream connection"); @@ -1081,6 +1083,30 @@ flb_sockfd_t flb_net_tcp_connect(const char *host, unsigned long port, if (is_async) { ret = flb_net_getaddrinfo(host, _port, &hints, &res, u_conn->u->net.dns_mode, connect_timeout); + /* + * When the output plugin is under load, DNS timers fire much later than + * they should and DNS lookups are slow as well. It is possible that + * the connection times out via the upstream handler. Since the socket + * has not yet been created, the upstream timeout handler doesn't have + * a valid fd to notify this coroutine on. A check is needed to see if + * the connection timed out even if the DNS lookup succeeded. + * FIXME: Ideally, the upstream timeout handler should be able to + * trigger an action that will cancel pending DNS queries. This could be + * implemented by storing the DNS socket in the upstream connection + * and having the upstream timeout handler call shutdown on it. 
It + * will need more plumbing through the code. + */ + if (u_conn->net_error > 0) { + str = strerror_r(u_conn->net_error, so_error_buf, sizeof(so_error_buf)); + flb_error("[net] TCP connection #%i failed because of an " + "upstream event after resolving DNS: error=%i:(%s)", + u_conn->fd, u_conn->net_error, str); + if (ret == 0 && res != NULL) { + flb_net_free_translated_addrinfo(res); + } + return -1; + } + } else { ret = getaddrinfo(host, _port, &hints, &res); diff --git a/src/flb_upstream.c b/src/flb_upstream.c index d2bd91072dd..19700c47ab0 100644 --- a/src/flb_upstream.c +++ b/src/flb_upstream.c @@ -794,11 +794,17 @@ int flb_upstream_conn_timeouts(struct mk_list *list) * Shutdown the connection, this is the safest way to indicate * that the socket cannot longer work and any co-routine on * waiting for I/O will receive the notification and trigger - * the error to it caller. + * the error to its caller. This only works if the connection + * has a valid fd, which is assigned after the DNS lookup + * succeeds. + * Do not call prepare_destroy_conn here since the connection + * is still pending. It will be handled when the function for + * creating a new connection returns. 
*/ - shutdown(u_conn->fd, SHUT_RDWR); + if (u_conn->fd > -1) { + shutdown(u_conn->fd, SHUT_RDWR); + } u_conn->net_error = ETIMEDOUT; - prepare_destroy_conn(u_conn); } } diff --git a/src/tls/flb_tls.c b/src/tls/flb_tls.c index 5731fcd4077..8bbc1e162c7 100644 --- a/src/tls/flb_tls.c +++ b/src/tls/flb_tls.c @@ -293,6 +293,8 @@ int flb_tls_session_create(struct flb_tls *tls, int flag; struct flb_tls_session *session; struct flb_upstream *u = u_conn->u; + char so_error_buf[256] = {0}; + char *str = NULL; /* Create TLS session */ session = tls->api->session_create(tls, u_conn); @@ -318,6 +320,13 @@ int flb_tls_session_create(struct flb_tls *tls, ret = tls->api->net_handshake(tls, session); if (ret != 0) { if (ret != FLB_TLS_WANT_READ && ret != FLB_TLS_WANT_WRITE) { + if (u_conn->net_error > 0) { + str = strerror_r(u_conn->net_error, so_error_buf, sizeof(so_error_buf)); + flb_error("[io_tls] tls handshake for connection #%i to %s:%i " + "failed because of an upstream event: error=%i:(%s)", + u_conn->fd, u->tcp_host, u->tcp_port, + u_conn->net_error, str); + } goto error; }