Skip to content

Commit

Permalink
Handle a lost standalone ack correctly. (project-chip#9895)
Browse files Browse the repository at this point in the history
* Handle a lost standalone ack correctly.

If a standalone ack gets lost and then there is an application-level
reply to the message that triggered the standalone ack, we want to
piggyback an ack on that application-level reply as well.  Otherwise
the other side can end up in a state where it has two outstanding
unacknowledged messages (the message we are replying to, and the reply
to our reply), which it can't handle.

The other change here is to fix ReliableMessageMgr to not crash if the
peer misbehaves and skips sending that piggyback ack described above.
Instead, we just error out from sending the response to the peer's
broken message.

Fixes project-chip#9796

* Address review comments
  • Loading branch information
bzbarsky-apple authored Sep 28, 2021
1 parent a3af3ac commit fa43d00
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/messaging/ExchangeMessageDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ CHIP_ERROR ExchangeMessageDispatch::SendMessage(SessionHandle session, uint16_t
payloadHeader.SetExchangeID(exchangeId).SetMessageType(protocol, type).SetInitiator(isInitiator);

// If there is a pending acknowledgment piggyback it on this message.
if (reliableMessageContext->IsAckPending())
if (reliableMessageContext->HasPiggybackAckPending())
{
payloadHeader.SetAckMessageCounter(reliableMessageContext->TakePendingPeerAckMessageCounter());

Expand Down
18 changes: 18 additions & 0 deletions src/messaging/ReliableMessageContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ CHIP_ERROR ReliableMessageContext::FlushAcks()
return err;
}

bool ReliableMessageContext::HasPiggybackAckPending() const
{
return mFlags.Has(Flags::kFlagAckMessageCounterIsValid);
}

uint64_t ReliableMessageContext::GetInitialRetransmitTimeoutTick()
{
return mConfig.mInitialRetransTimeoutTick;
Expand Down Expand Up @@ -213,6 +218,8 @@ CHIP_ERROR ReliableMessageContext::HandleNeedsAckInner(uint32_t messageCounter,
// Is there pending ack for a different message counter.
bool wasAckPending = IsAckPending() && mPendingPeerAckMessageCounter != messageCounter;

bool messageCounterWasValid = HasPiggybackAckPending();

// Temporary store currently pending ack message counter (even if there is none).
uint32_t tempAckMessageCounter = mPendingPeerAckMessageCounter;

Expand All @@ -228,6 +235,16 @@ CHIP_ERROR ReliableMessageContext::HandleNeedsAckInner(uint32_t messageCounter,
// Restore previously pending ack message counter.
SetPendingPeerAckMessageCounter(tempAckMessageCounter);
}
else if (messageCounterWasValid)
{
// Restore the previous value, so later piggybacks will pick it up,
// but don't set out "ack is pending" state, because we didn't use
// to have it set.
mPendingPeerAckMessageCounter = tempAckMessageCounter;
}
// Otherwise don't restore the invalid old mPendingPeerAckMessageCounter
// value, so we preserve the invariant that once we have had an ack
// pending we always have a valid mPendingPeerAckMessageCounter.

return err;
}
Expand Down Expand Up @@ -296,6 +313,7 @@ void ReliableMessageContext::SetPendingPeerAckMessageCounter(uint32_t aPeerAckMe
{
mPendingPeerAckMessageCounter = aPeerAckMessageCounter;
SetAckPending(true);
mFlags.Set(Flags::kFlagAckMessageCounterIsValid);
}

} // namespace Messaging
Expand Down
40 changes: 27 additions & 13 deletions src/messaging/ReliableMessageContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,24 @@ class ReliableMessageContext
CHIP_ERROR FlushAcks();

/**
* Take the pending peer ack message counter from the context. This must only be called
* when IsAckPending() is true. After this call, IsAckPending() will be
* false; it's the caller's responsibility to send the ack.
* Take the pending peer ack message counter from the context. This must
* only be called when HasPiggybackAckPending() is true. After this call,
* IsAckPending() will be false; it's the caller's responsibility to send
* the ack.
*/
uint32_t TakePendingPeerAckMessageCounter()
{
SetAckPending(false);
return mPendingPeerAckMessageCounter;
}

/**
* Check whether we have an ack to piggyback on the message we are sending.
* If true, TakePendingPeerAckMessageCounter will return a valid value that
* should be included as an ack in the message.
*/
bool HasPiggybackAckPending() const;

/**
* Get the initial retransmission interval. It would be the time to wait before
* retransmission after first failure.
Expand Down Expand Up @@ -183,33 +191,39 @@ class ReliableMessageContext
enum class Flags : uint16_t
{
/// When set, signifies that this context is the initiator of the exchange.
kFlagInitiator = 0x0001,
kFlagInitiator = (1u << 0),

/// When set, signifies that a response is expected for a message that is being sent.
kFlagResponseExpected = 0x0002,
kFlagResponseExpected = (1u << 1),

/// When set, automatically request an acknowledgment whenever a message is sent via UDP.
kFlagAutoRequestAck = 0x0004,
kFlagAutoRequestAck = (1u << 2),

/// Internal and debug only: when set, the exchange layer does not send an acknowledgment.
kFlagDropAckDebug = 0x0008,
kFlagDropAckDebug = (1u << 3),

/// When set, signifies current reliable message context is in usage.
kFlagOccupied = 0x0010,
kFlagOccupied = (1u << 4),

/// When set, signifies that there is an acknowledgment pending to be sent back.
kFlagAckPending = 0x0020,
kFlagAckPending = (1u << 5),

/// When set, signifies that there has once been an acknowledgment
/// pending to be sent back. In that case,
/// mPendingPeerAckMessageCounter is a valid message counter value for
/// some message we have needed to acknowledge in the past.
kFlagAckMessageCounterIsValid = (1u << 6),

/// When set, signifies that at least one message has been received from peer on this exchange context.
kFlagMsgRcvdFromPeer = 0x0040,
kFlagMsgRcvdFromPeer = (1u << 7),

/// When set, signifies that this exchange is waiting for a call to SendMessage.
kFlagWillSendMessage = 0x0080,
kFlagWillSendMessage = (1u << 8),

/// When set, signifies that we are currently in the middle of HandleMessage.
kFlagHandlingMessage = 0x0100,
kFlagHandlingMessage = (1u << 9),
/// When set, we have had Close() or Abort() called on us already.
kFlagClosed = 0x0200,
kFlagClosed = (1u << 10),
};

BitFlags<Flags> mFlags; // Internal state flags
Expand Down
11 changes: 10 additions & 1 deletion src/messaging/ReliableMessageMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,16 @@ CHIP_ERROR ReliableMessageMgr::AddToRetransTable(ReliableMessageContext * rc, Re
bool added = false;
CHIP_ERROR err = CHIP_NO_ERROR;

VerifyOrDie(rc != nullptr && !rc->IsOccupied());
VerifyOrDie(rc != nullptr);

if (rc->IsOccupied())
{
// This can happen if we have a misbehaving peer that is not sending
// acks with its application-level responses when it should, so we end
// up with two outstanding app-level messages both waiting for an ack.
// Just give up and error out in that case.
return CHIP_ERROR_INCORRECT_STATE;
}

for (RetransTableEntry & entry : mRetransTable)
{
Expand Down
147 changes: 141 additions & 6 deletions src/messaging/tests/TestReliableMessageProtocol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class MockAppDelegate : public ExchangeDelegate
if (mDropAckResponse)
{
auto * rc = ec->GetReliableMessageContext();
if (rc->IsAckPending())
if (rc->HasPiggybackAckPending())
{
(void) rc->TakePendingPeerAckMessageCounter();
}
Expand Down Expand Up @@ -798,7 +798,7 @@ void CheckReceiveAfterStandaloneAck(nlTestSuite * inSuite, void * inContext)
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0);
}

void CheckNoPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext)
void CheckPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext)
{
TestContext & ctx = *reinterpret_cast<TestContext *>(inContext);

Expand Down Expand Up @@ -912,10 +912,9 @@ void CheckNoPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext)
NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 5);
NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0);

// Ensure that we have received that response and it did not have a piggyback
// ack.
// Ensure that we have received that response and it had a piggyback ack.
NL_TEST_ASSERT(inSuite, mockSender.IsOnMessageReceivedCalled);
NL_TEST_ASSERT(inSuite, !mockSender.mReceivedPiggybackAck);
NL_TEST_ASSERT(inSuite, mockSender.mReceivedPiggybackAck);

err = ctx.GetExchangeManager().UnregisterUnsolicitedMessageHandlerForType(Echo::MsgType::EchoRequest);
NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR);
Expand Down Expand Up @@ -1262,6 +1261,141 @@ void CheckLostResponseWithPiggyback(nlTestSuite * inSuite, void * inContext)
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0);
}

void CheckLostStandaloneAck(nlTestSuite * inSuite, void * inContext)
{
/**
* This tests the following scenario:
* 1) A reliable message is sent from initiator to responder.
* 2) The responder sends a standalone ack, which is lost.
* 3) The responder sends an application-level response.
* 4) The initiator sends a reliable response to the app-level response.
*
* This should succeed, with all application-level messages being delivered
* and no crashes.
*/
TestContext & ctx = *reinterpret_cast<TestContext *>(inContext);

chip::System::PacketBufferHandle buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD));
NL_TEST_ASSERT(inSuite, !buffer.IsNull());

CHIP_ERROR err = CHIP_NO_ERROR;

MockAppDelegate mockReceiver;
err = ctx.GetExchangeManager().RegisterUnsolicitedMessageHandlerForType(Echo::MsgType::EchoRequest, &mockReceiver);
NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR);

mockReceiver.mTestSuite = inSuite;

MockAppDelegate mockSender;
ExchangeContext * exchange = ctx.NewExchangeToAlice(&mockSender);
NL_TEST_ASSERT(inSuite, exchange != nullptr);

mockSender.mTestSuite = inSuite;

ReliableMessageMgr * rm = ctx.GetExchangeManager().GetReliableMessageMgr();
NL_TEST_ASSERT(inSuite, rm != nullptr);

// Ensure the retransmit table is empty right now
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0);

// We send a message, the other side sends a standalone ack first (which is
// lost), then an application response, then we respond to that response.
// We need to keep both exchanges alive for that (so we can send the
// response from the receiver and so the initial sender exchange can send a
// response to that).
gLoopback.mSentMessageCount = 0;
gLoopback.mNumMessagesToDrop = 0;
gLoopback.mDroppedMessageCount = 0;
mockReceiver.mRetainExchange = true;
mockSender.mRetainExchange = true;

// And ensure the ack heading back our way is dropped.
mockReceiver.mDropAckResponse = true;

err = exchange->SendMessage(Echo::MsgType::EchoRequest, std::move(buffer), SendFlags(SendMessageFlags::kExpectResponse));
NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR);

// Ensure the message was sent.
NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 1);
NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0);

// And that it was received.
NL_TEST_ASSERT(inSuite, mockReceiver.IsOnMessageReceivedCalled);

// And that we have not gotten any app-level responses or acks so far.
NL_TEST_ASSERT(inSuite, !mockSender.IsOnMessageReceivedCalled);
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

ReliableMessageContext * receiverRc = mockReceiver.mExchange->GetReliableMessageContext();
// Ack should have been dropped.
NL_TEST_ASSERT(inSuite, !receiverRc->IsAckPending());

// Don't drop any more acks.
mockReceiver.mDropAckResponse = false;

// Now send a message from the other side.
buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD));
NL_TEST_ASSERT(inSuite, !buffer.IsNull());

mockReceiver.mExchange->SendMessage(Echo::MsgType::EchoResponse, std::move(buffer),
SendFlags(SendMessageFlags::kExpectResponse));

// Ensure the response was sent.
NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 2);
NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0);

// Ensure that we have received that response and had a piggyback ack.
NL_TEST_ASSERT(inSuite, mockSender.IsOnMessageReceivedCalled);
NL_TEST_ASSERT(inSuite, mockSender.mReceivedPiggybackAck);
// We now have just the received message waiting for an ack.
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

// Reset various state so we can measure things again.
mockReceiver.IsOnMessageReceivedCalled = false;
mockReceiver.mReceivedPiggybackAck = false;
mockSender.IsOnMessageReceivedCalled = false;
mockSender.mReceivedPiggybackAck = false;

// Stop retaining the recipient exchange.
mockReceiver.mRetainExchange = false;

// Now send a new message to the other side.
buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD));
NL_TEST_ASSERT(inSuite, !buffer.IsNull());

err = exchange->SendMessage(Echo::MsgType::EchoRequest, std::move(buffer));
NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR);

// Ensure the message and the standalone ack to it were sent.
NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 4);
NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0);

// And that it was received.
NL_TEST_ASSERT(inSuite, mockReceiver.IsOnMessageReceivedCalled);
NL_TEST_ASSERT(inSuite, mockReceiver.mReceivedPiggybackAck);

// And that receiver has no ack pending.
NL_TEST_ASSERT(inSuite, !receiverRc->IsAckPending());

// And that there are no un-acked messages left.
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0);
}

/**
* TODO: A test that we should have but can't write with the existing
* infrastructure we have:
*
* 1. A sends message 1 to B
* 2. B is slow to respond, A does a resend and the resend is delayed in the network.
* 3. B responds with message 2, which acks message 1.
* 4. A sends message 3 to B
* 5. B sends standalone ack to message 3, which is lost
* 6. The duplicate message from step 3 is delivered and triggers a standalone ack.
* 7. B responds with message 4, which should carry a piggyback ack for message 3
* (this is the part that needs testing!)
* 8. A sends message 5 to B.
*/

// Test Suite

/**
Expand All @@ -1280,12 +1414,13 @@ const nlTest sTests[] =
NL_TEST_DEF("Test ReliableMessageMgr::CheckDuplicateMessage", CheckDuplicateMessage),
NL_TEST_DEF("Test ReliableMessageMgr::CheckDuplicateMessageClosedExchange", CheckDuplicateMessageClosedExchange),
NL_TEST_DEF("Test that a reply after a standalone ack comes through correctly", CheckReceiveAfterStandaloneAck),
NL_TEST_DEF("Test that a reply to a non-MRP message does not piggyback an ack even if there were MRP things happening on the context before", CheckNoPiggybackAfterPiggyback),
NL_TEST_DEF("Test that a reply to a non-MRP message piggybacks an ack if there were MRP things happening on the context before", CheckPiggybackAfterPiggyback),
NL_TEST_DEF("Test sending an unsolicited ack-soliciting 'standalone ack' message", CheckSendUnsolicitedStandaloneAckMessage),
NL_TEST_DEF("Test ReliableMessageMgr::CheckSendStandaloneAckMessage", CheckSendStandaloneAckMessage),
NL_TEST_DEF("Test command, response, default response, with receiver closing exchange after sending response", CheckMessageAfterClosed),
NL_TEST_DEF("Test that unencrypted message is dropped if exchange requires encryption", CheckUnencryptedMessageReceiveFailure),
NL_TEST_DEF("Test that dropping an application-level message with a piggyback ack works ok once both sides retransmit", CheckLostResponseWithPiggyback),
NL_TEST_DEF("Test that an application-level response-to-response after a lost standalone ack to the initial message works", CheckLostStandaloneAck),

NL_TEST_SENTINEL()
};
Expand Down

0 comments on commit fa43d00

Please sign in to comment.