Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return an error if we can't connect and limit the number of retries #813

Merged
merged 5 commits into from
Jan 29, 2018
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions gossipd/gossip.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ struct reaching {

/* Did we succeed? */
bool succeeded;

/* How many times have we attempted to connect? */
u32 attempts;

/* How many timest to attempt */
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: comment misspelling "timest"

u32 max_attempts;

/* Timestamp of the first attempt */
u32 first_attempt;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mildly prefer timestamps as struct timeabs, which is clearer, but not worth another round of feedback for.

};

/* Things we need when we're talking direct to the peer. */
Expand Down Expand Up @@ -1557,13 +1566,25 @@ static void try_connect(struct reaching *reach);

static void connect_failed(struct io_conn *conn, struct reaching *reach)
{
status_trace("Failed connected out for %s, will try again",
type_to_string(trc, struct pubkey, &reach->id));

/* FIXME: Configurable timer! */
new_reltimer(&reach->daemon->timers, reach,
time_from_sec(5),
try_connect, reach);
u32 diff = time_now().ts.tv_sec - reach->first_attempt;
reach->attempts++;

if (reach->attempts >= reach->max_attempts) {
status_trace("Failed to connect after %d attempts, giving up "
"after %d seconds",
reach->attempts, diff);
daemon_conn_send(
&reach->daemon->master,
take(towire_gossip_peer_connection_failed(
conn, &reach->id, diff, reach->attempts, false)));
tal_free(reach);
} else {
status_trace("Failed connected out for %s, will try again",
type_to_string(trc, struct pubkey, &reach->id));
/* FIXME: Configurable timer! */
new_reltimer(&reach->daemon->timers, reach, time_from_sec(5),
try_connect, reach);
}
}

static struct io_plan *conn_init(struct io_conn *conn, struct reaching *reach)
Expand Down Expand Up @@ -1625,6 +1646,12 @@ static void try_connect(struct reaching *reach)
/* FIXME: add reach_failed message */
status_trace("No address known for %s, giving up",
type_to_string(trc, struct pubkey, &reach->id));
daemon_conn_send(
&reach->daemon->master,
take(towire_gossip_peer_connection_failed(
reach, &reach->id,
time_now().ts.tv_sec - reach->first_attempt,
reach->attempts, true)));
tal_free(reach);
return;
}
Expand Down Expand Up @@ -1683,6 +1710,9 @@ static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id)
reach->succeeded = false;
reach->daemon = daemon;
reach->id = *id;
reach->first_attempt = time_now().ts.tv_sec;
reach->attempts = 0;
reach->max_attempts = 10;
list_add_tail(&daemon->reaching, &reach->list);
tal_add_destructor(reach, destroy_reaching);

Expand Down Expand Up @@ -1913,6 +1943,7 @@ static struct io_plan *recv_req(struct io_conn *conn, struct daemon_conn *master
case WIRE_GOSSIP_RESOLVE_CHANNEL_REPLY:
case WIRE_GOSSIP_PEER_CONNECTED:
case WIRE_GOSSIP_PEER_ALREADY_CONNECTED:
case WIRE_GOSSIP_PEER_CONNECTION_FAILED:
case WIRE_GOSSIP_PEER_NONGOSSIP:
case WIRE_GOSSIP_GET_UPDATE:
case WIRE_GOSSIP_GET_UPDATE_REPLY:
Expand Down
7 changes: 7 additions & 0 deletions gossipd/gossip_wire.csv
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ gossip_peer_connected,,lfeatures,lflen*u8
gossip_peer_already_connected,3015
gossip_peer_already_connected,,id,struct pubkey

# gossipd -> master: attempted to connect, unsuccessful, gave up
gossip_peer_connection_failed,3020
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need some good way to allocate message numbers, now I need to bump the message number in #638 again...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, fortunately compile will fail with a clash, so whoever is applying it can rebase and fixup...

gossip_peer_connection_failed,,id,struct pubkey
gossip_peer_connection_failed,,timeout,u32
gossip_peer_connection_failed,,attempts,u32
gossip_peer_connection_failed,,addr_unknown,bool

# Gossipd -> master: peer sent non-gossip packet. Two fds: peer and gossip
gossip_peer_nongossip,3003
gossip_peer_nongossip,,id,struct pubkey
Expand Down
3 changes: 3 additions & 0 deletions lightningd/gossip_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ static unsigned gossip_msg(struct subd *gossip, const u8 *msg, const int *fds)
case WIRE_GOSSIP_PEER_ALREADY_CONNECTED:
peer_already_connected(gossip->ld, msg);
break;
case WIRE_GOSSIP_PEER_CONNECTION_FAILED:
peer_connection_failed(gossip->ld, msg);
break;
case WIRE_GOSSIP_PEER_NONGOSSIP:
if (tal_count(fds) != 2)
return 2;
Expand Down
35 changes: 35 additions & 0 deletions lightningd/peer_control.c
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,41 @@ void peer_already_connected(struct lightningd *ld, const u8 *msg)
connect_succeeded(ld, &id);
}

void peer_connection_failed(struct lightningd *ld, const u8 *msg)
{
struct pubkey id;
u32 attempts, timediff;
struct connect *i, *next;
bool addr_unknown;
char *error;

if (!fromwire_gossip_peer_connection_failed(msg, NULL, &id, &attempts,
&timediff, &addr_unknown))
fatal(
"Gossip gave bad GOSSIP_PEER_CONNECTION_FAILED message %s",
tal_hex(msg, msg));

if (addr_unknown) {
error = tal_fmt(
msg, "No address known for node %s, please provide one",
type_to_string(msg, struct pubkey, &id));
} else {
error = tal_fmt(msg, "Could not connect to %s after %d seconds and %d attempts",
type_to_string(msg, struct pubkey, &id), timediff,
attempts);
}

/* Careful! Completing command frees connect. */
list_for_each_safe(&ld->connects, i, next, list) {
if (!pubkey_eq(&i->id, &id))
continue;

command_fail(i->cmd, "%s", error);
}
}



Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a lot of vertical space :)

void peer_sent_nongossip(struct lightningd *ld,
const struct pubkey *id,
const struct wireaddr *addr,
Expand Down
3 changes: 3 additions & 0 deletions lightningd/peer_control.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ void peer_connected(struct lightningd *ld, const u8 *msg,
/* This simply means we asked to reach a peer, but we already have it */
void peer_already_connected(struct lightningd *ld, const u8 *msg);

/* We were unable to connect to the peer */
void peer_connection_failed(struct lightningd *ld, const u8 *msg);

void peer_sent_nongossip(struct lightningd *ld,
const struct pubkey *id,
const struct wireaddr *addr,
Expand Down