Skip to content

Commit

Permalink
channeld: treat all incoming errors as "soft", so we retry.
Browse files Browse the repository at this point in the history
We still close the channel if we *send* an error, but we seem to have hit
another case where LND sends an error which seems transient, so this will
make a best-effort attempt to preserve our channel in that case.

Some test have to be modified, since they don't terminate as they did
previously :(

Changelog-Changed: quirks: We'll now reconnect and retry if we get an error on an established channel. This works around lnd sending error messages that may be non-fatal.

Signed-off-by: Rusty Russell <[email protected]>
  • Loading branch information
rustyrussell authored and cdecker committed Dec 13, 2019
1 parent 839909d commit 1d0c433
Show file tree
Hide file tree
Showing 10 changed files with 41 additions and 30 deletions.
7 changes: 5 additions & 2 deletions channeld/channeld.c
Original file line number Diff line number Diff line change
Expand Up @@ -1824,7 +1824,9 @@ static void peer_in(struct peer *peer, const u8 *msg)
return;
}

if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg))
/* Since LND seems to send errors which aren't actually fatal events,
* we treat errors here as soft. */
if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true, msg))
return;

/* Must get funding_locked before almost anything. */
Expand Down Expand Up @@ -2322,7 +2324,8 @@ static void peer_reconnect(struct peer *peer,
do {
clean_tmpctx();
msg = sync_crypto_read(tmpctx, peer->pps);
} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg)
} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true,
msg)
|| capture_premature_msg(&premature_msgs, msg));

if (peer->channel->option_static_remotekey) {
Expand Down
2 changes: 1 addition & 1 deletion closingd/closingd.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static u8 *closing_read_peer_msg(const tal_t *ctx,
handle_gossip_msg(pps, take(msg));
continue;
}
if (!handle_peer_gossip_or_error(pps, channel_id, msg))
if (!handle_peer_gossip_or_error(pps, channel_id, false, msg))
return msg;
}
}
Expand Down
16 changes: 3 additions & 13 deletions common/peer_failed.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,25 +52,15 @@ void peer_failed(struct per_peer_state *pps,
/* We're failing because peer sent us an error message */
void peer_failed_received_errmsg(struct per_peer_state *pps,
const char *desc,
const struct channel_id *channel_id)
const struct channel_id *channel_id,
bool soft_error)
{
static const struct channel_id all_channels;
u8 *msg;
bool sync_error;

/* <+roasbeef> rusty: sync error can just be a timing thing
* <+roasbeef> andn doesn't always mean that we can't continue forwrd,
* or y'all sent the wrong info
*/
/* So while LND is sitting in the corner eating paint, back off. */
sync_error = strstr(desc, "sync error");
if (sync_error)
status_unusual("Peer said '%s' so we'll come back later",
desc);

if (!channel_id)
channel_id = &all_channels;
msg = towire_status_peer_error(NULL, channel_id, desc, sync_error, pps,
msg = towire_status_peer_error(NULL, channel_id, desc, soft_error, pps,
NULL);
peer_billboard(true, "Received error from peer: %s", desc);
peer_fatal_continue(take(msg), pps);
Expand Down
4 changes: 3 additions & 1 deletion common/peer_failed.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ void peer_failed(struct per_peer_state *pps,
* channel_id means all channels. */
void peer_failed_received_errmsg(struct per_peer_state *pps,
const char *desc,
const struct channel_id *channel_id) NORETURN;
const struct channel_id *channel_id,
bool soft_error)
NORETURN;

/* I/O error */
void peer_failed_connection_lost(void) NORETURN;
Expand Down
4 changes: 3 additions & 1 deletion common/read_peer_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ bool handle_timestamp_filter(struct per_peer_state *pps, const u8 *msg TAKES)

bool handle_peer_gossip_or_error(struct per_peer_state *pps,
const struct channel_id *channel_id,
bool soft_error,
const u8 *msg TAKES)
{
char *err;
Expand Down Expand Up @@ -176,7 +177,8 @@ bool handle_peer_gossip_or_error(struct per_peer_state *pps,
if (err)
peer_failed_received_errmsg(pps, err,
all_channels
? NULL : channel_id);
? NULL : channel_id,
soft_error);

/* Ignore unknown channel errors. */
goto handled;
Expand Down
2 changes: 2 additions & 0 deletions common/read_peer_msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
* handle_peer_gossip_or_error - simple handler for all the above cases.
* @pps: per-peer state.
* @channel_id: the channel id of the current channel.
* @soft_error: tell lightningd that incoming error is non-fatal.
* @msg: the peer message (only taken if returns true).
*
* This returns true if it handled the packet: a gossip packet (forwarded
Expand All @@ -66,6 +67,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
*/
bool handle_peer_gossip_or_error(struct per_peer_state *pps,
const struct channel_id *channel_id,
bool soft_error,
const u8 *msg TAKES);

/**
Expand Down
4 changes: 2 additions & 2 deletions openingd/openingd.c
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ static u8 *opening_negotiate_msg(const tal_t *ctx, struct state *state,
wire_sync_write(REQ_FD, take(msg));
}
peer_failed_received_errmsg(state->pps, err,
NULL);
NULL, false);
}
negotiation_aborted(state, am_funder,
tal_fmt(tmpctx, "They sent error %s",
Expand Down Expand Up @@ -1283,7 +1283,7 @@ static u8 *handle_peer_in(struct state *state)

/* Handles standard cases, and legal unknown ones. */
if (handle_peer_gossip_or_error(state->pps,
&state->channel_id, msg))
&state->channel_id, false, msg))
return NULL;

sync_crypto_write(state->pps,
Expand Down
10 changes: 6 additions & 4 deletions tests/test_closing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1608,7 +1608,7 @@ def test_shutdown(node_factory):

@flaky
@unittest.skipIf(not DEVELOPER, "needs to set upfront_shutdown_script")
def test_option_upfront_shutdown_script(node_factory, bitcoind):
def test_option_upfront_shutdown_script(node_factory, bitcoind, executor):
l1 = node_factory.get_node(start=False)
# Insist on upfront script we're not going to match.
l1.daemon.env["DEV_OPENINGD_UPFRONT_SHUTDOWN_SCRIPT"] = "76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac"
Expand All @@ -1618,14 +1618,16 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
l1.fund_channel(l2, 1000000, False)

l1.rpc.close(l2.info['id'])
# This will block, as l12 will send an error but l2 will retry.
fut = executor.submit(l1.rpc.close, l2.info['id'])

# l2 will close unilaterally when it dislikes shutdown script.
l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')

# Clear channel.
wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)
bitcoind.generate_block(1)
fut.result(TIMEOUT)
wait_for(lambda: [c['state'] for c in only_one(l1.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])
wait_for(lambda: [c['state'] for c in only_one(l2.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])

Expand All @@ -1636,7 +1638,7 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
l2.rpc.close(l1.info['id'])

# l2 will close unilaterally when it dislikes shutdown script.
l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')

# Clear channel.
wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)
Expand Down
12 changes: 9 additions & 3 deletions tests/test_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,7 @@ def test_funding_by_utxos(node_factory, bitcoind):
l1.rpc.fundchannel(l3.info["id"], int(0.01 * 10**8), utxos=utxos)


@unittest.skipIf(not DEVELOPER, "needs dev_forget_channel")
def test_funding_external_wallet_corners(node_factory, bitcoind):
l1 = node_factory.get_node()
l2 = node_factory.get_node()
Expand Down Expand Up @@ -1000,6 +1001,9 @@ def test_funding_external_wallet_corners(node_factory, bitcoind):
# Canceld channel after fundchannel_complete
assert l1.rpc.fundchannel_cancel(l2.info['id'])['cancelled']

# l2 won't give up, since it considers error "soft".
l2.rpc.dev_forget_channel(l1.info['id'])

l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
l1.rpc.fundchannel_start(l2.info['id'], amount)['funding_address']
assert l1.rpc.fundchannel_complete(l2.info['id'], prep['txid'], txout)['commitments_secured']
Expand Down Expand Up @@ -1431,19 +1435,21 @@ def test_update_fee(node_factory, bitcoind):


@unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
def test_fee_limits(node_factory):
def test_fee_limits(node_factory, bitcoind):
l1, l2 = node_factory.line_graph(2, opts={'dev-max-fee-multiplier': 5, 'may_reconnect': True}, fundchannel=True)

# L1 asks for stupid low fee (will actually hit the floor of 253)
l1.stop()
l1.set_feerates((15, 15, 15), False)
l1.start()

l1.daemon.wait_for_log('Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee 253 outside range 1875-75000')
l1.daemon.wait_for_log('Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee 253 outside range 1875-75000')
# Make sure the resolution of this one doesn't interfere with the next!
# Note: may succeed, may fail with insufficient fee, depending on how
# bitcoind feels!
l1.daemon.wait_for_log('sendrawtx exit')
l2.daemon.wait_for_log('sendrawtx exit')
bitcoind.generate_block(1)
sync_blockheight(bitcoind, [l1, l2])

# Trying to open a channel with too low a fee-rate is denied
l4 = node_factory.get_node()
Expand Down
10 changes: 7 additions & 3 deletions tests/test_pay.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,16 +248,20 @@ def test_pay_disconnect(node_factory, bitcoind):
l1.set_feerates((10**6, 1000**6, 1000**6), False)

# Wait for l1 notice
l1.daemon.wait_for_log(r'Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee \d+ outside range 1875-75000')
l1.daemon.wait_for_log(r'Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee \d+ outside range 1875-75000')

# l2 fails hard.
l2.daemon.wait_for_log('sendrawtx exit')
bitcoind.generate_block(1, wait_for_mempool=1)
sync_blockheight(bitcoind, [l1, l2])

# Should fail due to permenant channel fail
with pytest.raises(RpcError, match=r'failed: WIRE_UNKNOWN_NEXT_PEER \(First peer not ready\)'):
with pytest.raises(RpcError, match=r'WIRE_UNKNOWN_NEXT_PEER'):
l1.rpc.sendpay(route, rhash)

assert not l1.daemon.is_in_log('Payment is still in progress')

# After it sees block, someone should close channel.
bitcoind.generate_block(1)
l1.daemon.wait_for_log('ONCHAIN')


Expand Down

0 comments on commit 1d0c433

Please sign in to comment.