From 16353241a86438bef2673fa6ce58e7eab67b2d8b Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Tue, 28 Sep 2021 11:52:23 -0400 Subject: [PATCH] Handle a lost standalone ack correctly. (#9895) * Handle a lost standalone ack correctly. If a standalone ack gets lost and then there is an application-level reply to the message that triggered the standalone ack, we want to piggyback an ack on that application-level reply as well. Otherwise the other side can end up in a state where it has two outstanding unacknowledged messages (the message we are replying to, and the reply to our reply), which it can't handle. The other change here is to fix ReliableMessageMgr to not crash if the peer misbehaves and skips sending that piggyback ack described above. Instead, we just error out from sending the response to the peer's broken message. Fixes https://github.com/project-chip/connectedhomeip/issues/9796 * Address review comments --- src/messaging/ExchangeMessageDispatch.cpp | 2 +- src/messaging/ReliableMessageContext.cpp | 18 +++ src/messaging/ReliableMessageContext.h | 40 +++-- src/messaging/ReliableMessageMgr.cpp | 11 +- .../tests/TestReliableMessageProtocol.cpp | 147 +++++++++++++++++- 5 files changed, 197 insertions(+), 21 deletions(-) diff --git a/src/messaging/ExchangeMessageDispatch.cpp b/src/messaging/ExchangeMessageDispatch.cpp index 11707f895a3099..d683b10f47b9b0 100644 --- a/src/messaging/ExchangeMessageDispatch.cpp +++ b/src/messaging/ExchangeMessageDispatch.cpp @@ -50,7 +50,7 @@ CHIP_ERROR ExchangeMessageDispatch::SendMessage(SessionHandle session, uint16_t payloadHeader.SetExchangeID(exchangeId).SetMessageType(protocol, type).SetInitiator(isInitiator); // If there is a pending acknowledgment piggyback it on this message. - if (reliableMessageContext->IsAckPending()) + if (reliableMessageContext->HasPiggybackAckPending()) { payloadHeader.SetAckMessageCounter(reliableMessageContext->TakePendingPeerAckMessageCounter()); diff --git a/src/messaging/ReliableMessageContext.cpp b/src/messaging/ReliableMessageContext.cpp index cedd923495b062..fe45f08052b73c 100644 --- a/src/messaging/ReliableMessageContext.cpp +++ b/src/messaging/ReliableMessageContext.cpp @@ -136,6 +136,11 @@ CHIP_ERROR ReliableMessageContext::FlushAcks() return err; } +bool ReliableMessageContext::HasPiggybackAckPending() const +{ + return mFlags.Has(Flags::kFlagAckMessageCounterIsValid); +} + uint64_t ReliableMessageContext::GetInitialRetransmitTimeoutTick() { return mConfig.mInitialRetransTimeoutTick; @@ -213,6 +218,8 @@ CHIP_ERROR ReliableMessageContext::HandleNeedsAckInner(uint32_t messageCounter, // Is there pending ack for a different message counter. bool wasAckPending = IsAckPending() && mPendingPeerAckMessageCounter != messageCounter; + bool messageCounterWasValid = HasPiggybackAckPending(); + // Temporary store currently pending ack message counter (even if there is none). uint32_t tempAckMessageCounter = mPendingPeerAckMessageCounter; @@ -228,6 +235,16 @@ CHIP_ERROR ReliableMessageContext::HandleNeedsAckInner(uint32_t messageCounter, // Restore previously pending ack message counter. SetPendingPeerAckMessageCounter(tempAckMessageCounter); } + else if (messageCounterWasValid) + { + // Restore the previous value, so later piggybacks will pick it up, + // but don't set out "ack is pending" state, because we didn't use + // to have it set. + mPendingPeerAckMessageCounter = tempAckMessageCounter; + } + // Otherwise don't restore the invalid old mPendingPeerAckMessageCounter + // value, so we preserve the invariant that once we have had an ack + // pending we always have a valid mPendingPeerAckMessageCounter. return err; } @@ -296,6 +313,7 @@ void ReliableMessageContext::SetPendingPeerAckMessageCounter(uint32_t aPeerAckMe { mPendingPeerAckMessageCounter = aPeerAckMessageCounter; SetAckPending(true); + mFlags.Set(Flags::kFlagAckMessageCounterIsValid); } } // namespace Messaging diff --git a/src/messaging/ReliableMessageContext.h b/src/messaging/ReliableMessageContext.h index 410d1d01173bcb..606d7c781116c7 100644 --- a/src/messaging/ReliableMessageContext.h +++ b/src/messaging/ReliableMessageContext.h @@ -58,9 +58,10 @@ class ReliableMessageContext CHIP_ERROR FlushAcks(); /** - * Take the pending peer ack message counter from the context. This must only be called - * when IsAckPending() is true. After this call, IsAckPending() will be - * false; it's the caller's responsibility to send the ack. + * Take the pending peer ack message counter from the context. This must + * only be called when HasPiggybackAckPending() is true. After this call, + * IsAckPending() will be false; it's the caller's responsibility to send + * the ack. */ uint32_t TakePendingPeerAckMessageCounter() { @@ -68,6 +69,13 @@ class ReliableMessageContext return mPendingPeerAckMessageCounter; } + /** + * Check whether we have an ack to piggyback on the message we are sending. + * If true, TakePendingPeerAckMessageCounter will return a valid value that + * should be included as an ack in the message. + */ + bool HasPiggybackAckPending() const; + /** * Get the initial retransmission interval. It would be the time to wait before * retransmission after first failure. @@ -183,33 +191,39 @@ class ReliableMessageContext enum class Flags : uint16_t { /// When set, signifies that this context is the initiator of the exchange. - kFlagInitiator = 0x0001, + kFlagInitiator = (1u << 0), /// When set, signifies that a response is expected for a message that is being sent. - kFlagResponseExpected = 0x0002, + kFlagResponseExpected = (1u << 1), /// When set, automatically request an acknowledgment whenever a message is sent via UDP. - kFlagAutoRequestAck = 0x0004, + kFlagAutoRequestAck = (1u << 2), /// Internal and debug only: when set, the exchange layer does not send an acknowledgment. - kFlagDropAckDebug = 0x0008, + kFlagDropAckDebug = (1u << 3), /// When set, signifies current reliable message context is in usage. - kFlagOccupied = 0x0010, + kFlagOccupied = (1u << 4), /// When set, signifies that there is an acknowledgment pending to be sent back. - kFlagAckPending = 0x0020, + kFlagAckPending = (1u << 5), + + /// When set, signifies that there has once been an acknowledgment + /// pending to be sent back. In that case, + /// mPendingPeerAckMessageCounter is a valid message counter value for + /// some message we have needed to acknowledge in the past. + kFlagAckMessageCounterIsValid = (1u << 6), /// When set, signifies that at least one message has been received from peer on this exchange context. - kFlagMsgRcvdFromPeer = 0x0040, + kFlagMsgRcvdFromPeer = (1u << 7), /// When set, signifies that this exchange is waiting for a call to SendMessage. - kFlagWillSendMessage = 0x0080, + kFlagWillSendMessage = (1u << 8), /// When set, signifies that we are currently in the middle of HandleMessage. - kFlagHandlingMessage = 0x0100, + kFlagHandlingMessage = (1u << 9), /// When set, we have had Close() or Abort() called on us already. - kFlagClosed = 0x0200, + kFlagClosed = (1u << 10), }; BitFlags mFlags; // Internal state flags diff --git a/src/messaging/ReliableMessageMgr.cpp b/src/messaging/ReliableMessageMgr.cpp index 98a2a21985c34b..d8040b9fbef269 100644 --- a/src/messaging/ReliableMessageMgr.cpp +++ b/src/messaging/ReliableMessageMgr.cpp @@ -268,7 +268,16 @@ CHIP_ERROR ReliableMessageMgr::AddToRetransTable(ReliableMessageContext * rc, Re bool added = false; CHIP_ERROR err = CHIP_NO_ERROR; - VerifyOrDie(rc != nullptr && !rc->IsOccupied()); + VerifyOrDie(rc != nullptr); + + if (rc->IsOccupied()) + { + // This can happen if we have a misbehaving peer that is not sending + // acks with its application-level responses when it should, so we end + // up with two outstanding app-level messages both waiting for an ack. + // Just give up and error out in that case. + return CHIP_ERROR_INCORRECT_STATE; + } for (RetransTableEntry & entry : mRetransTable) { diff --git a/src/messaging/tests/TestReliableMessageProtocol.cpp b/src/messaging/tests/TestReliableMessageProtocol.cpp index a3989259e99e9e..6d4f864dee5a8f 100644 --- a/src/messaging/tests/TestReliableMessageProtocol.cpp +++ b/src/messaging/tests/TestReliableMessageProtocol.cpp @@ -76,7 +76,7 @@ class MockAppDelegate : public ExchangeDelegate if (mDropAckResponse) { auto * rc = ec->GetReliableMessageContext(); - if (rc->IsAckPending()) + if (rc->HasPiggybackAckPending()) { (void) rc->TakePendingPeerAckMessageCounter(); } @@ -798,7 +798,7 @@ void CheckReceiveAfterStandaloneAck(nlTestSuite * inSuite, void * inContext) NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0); } -void CheckNoPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext) +void CheckPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext) { TestContext & ctx = *reinterpret_cast(inContext); @@ -912,10 +912,9 @@ void CheckNoPiggybackAfterPiggyback(nlTestSuite * inSuite, void * inContext) NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 5); NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0); - // Ensure that we have received that response and it did not have a piggyback - // ack. + // Ensure that we have received that response and it had a piggyback ack. NL_TEST_ASSERT(inSuite, mockSender.IsOnMessageReceivedCalled); - NL_TEST_ASSERT(inSuite, !mockSender.mReceivedPiggybackAck); + NL_TEST_ASSERT(inSuite, mockSender.mReceivedPiggybackAck); err = ctx.GetExchangeManager().UnregisterUnsolicitedMessageHandlerForType(Echo::MsgType::EchoRequest); NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR); @@ -1262,6 +1261,141 @@ void CheckLostResponseWithPiggyback(nlTestSuite * inSuite, void * inContext) NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0); } +void CheckLostStandaloneAck(nlTestSuite * inSuite, void * inContext) +{ + /** + * This tests the following scenario: + * 1) A reliable message is sent from initiator to responder. + * 2) The responder sends a standalone ack, which is lost. + * 3) The responder sends an application-level response. + * 4) The initiator sends a reliable response to the app-level response. + * + * This should succeed, with all application-level messages being delivered + * and no crashes. + */ + TestContext & ctx = *reinterpret_cast(inContext); + + chip::System::PacketBufferHandle buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD)); + NL_TEST_ASSERT(inSuite, !buffer.IsNull()); + + CHIP_ERROR err = CHIP_NO_ERROR; + + MockAppDelegate mockReceiver; + err = ctx.GetExchangeManager().RegisterUnsolicitedMessageHandlerForType(Echo::MsgType::EchoRequest, &mockReceiver); + NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR); + + mockReceiver.mTestSuite = inSuite; + + MockAppDelegate mockSender; + ExchangeContext * exchange = ctx.NewExchangeToAlice(&mockSender); + NL_TEST_ASSERT(inSuite, exchange != nullptr); + + mockSender.mTestSuite = inSuite; + + ReliableMessageMgr * rm = ctx.GetExchangeManager().GetReliableMessageMgr(); + NL_TEST_ASSERT(inSuite, rm != nullptr); + + // Ensure the retransmit table is empty right now + NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0); + + // We send a message, the other side sends a standalone ack first (which is + // lost), then an application response, then we respond to that response. + // We need to keep both exchanges alive for that (so we can send the + // response from the receiver and so the initial sender exchange can send a + // response to that). + gLoopback.mSentMessageCount = 0; + gLoopback.mNumMessagesToDrop = 0; + gLoopback.mDroppedMessageCount = 0; + mockReceiver.mRetainExchange = true; + mockSender.mRetainExchange = true; + + // And ensure the ack heading back our way is dropped. + mockReceiver.mDropAckResponse = true; + + err = exchange->SendMessage(Echo::MsgType::EchoRequest, std::move(buffer), SendFlags(SendMessageFlags::kExpectResponse)); + NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR); + + // Ensure the message was sent. + NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 1); + NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0); + + // And that it was received. + NL_TEST_ASSERT(inSuite, mockReceiver.IsOnMessageReceivedCalled); + + // And that we have not gotten any app-level responses or acks so far. + NL_TEST_ASSERT(inSuite, !mockSender.IsOnMessageReceivedCalled); + NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); + + ReliableMessageContext * receiverRc = mockReceiver.mExchange->GetReliableMessageContext(); + // Ack should have been dropped. + NL_TEST_ASSERT(inSuite, !receiverRc->IsAckPending()); + + // Don't drop any more acks. + mockReceiver.mDropAckResponse = false; + + // Now send a message from the other side. + buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD)); + NL_TEST_ASSERT(inSuite, !buffer.IsNull()); + + mockReceiver.mExchange->SendMessage(Echo::MsgType::EchoResponse, std::move(buffer), + SendFlags(SendMessageFlags::kExpectResponse)); + + // Ensure the response was sent. + NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 2); + NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0); + + // Ensure that we have received that response and had a piggyback ack. + NL_TEST_ASSERT(inSuite, mockSender.IsOnMessageReceivedCalled); + NL_TEST_ASSERT(inSuite, mockSender.mReceivedPiggybackAck); + // We now have just the received message waiting for an ack. + NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); + + // Reset various state so we can measure things again. + mockReceiver.IsOnMessageReceivedCalled = false; + mockReceiver.mReceivedPiggybackAck = false; + mockSender.IsOnMessageReceivedCalled = false; + mockSender.mReceivedPiggybackAck = false; + + // Stop retaining the recipient exchange. + mockReceiver.mRetainExchange = false; + + // Now send a new message to the other side. + buffer = chip::MessagePacketBuffer::NewWithData(PAYLOAD, sizeof(PAYLOAD)); + NL_TEST_ASSERT(inSuite, !buffer.IsNull()); + + err = exchange->SendMessage(Echo::MsgType::EchoRequest, std::move(buffer)); + NL_TEST_ASSERT(inSuite, err == CHIP_NO_ERROR); + + // Ensure the message and the standalone ack to it were sent. + NL_TEST_ASSERT(inSuite, gLoopback.mSentMessageCount == 4); + NL_TEST_ASSERT(inSuite, gLoopback.mDroppedMessageCount == 0); + + // And that it was received. + NL_TEST_ASSERT(inSuite, mockReceiver.IsOnMessageReceivedCalled); + NL_TEST_ASSERT(inSuite, mockReceiver.mReceivedPiggybackAck); + + // And that receiver has no ack pending. + NL_TEST_ASSERT(inSuite, !receiverRc->IsAckPending()); + + // And that there are no un-acked messages left. + NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 0); +} + +/** + * TODO: A test that we should have but can't write with the existing + * infrastructure we have: + * + * 1. A sends message 1 to B + * 2. B is slow to respond, A does a resend and the resend is delayed in the network. + * 3. B responds with message 2, which acks message 1. + * 4. A sends message 3 to B + * 5. B sends standalone ack to message 3, which is lost + * 6. The duplicate message from step 3 is delivered and triggers a standalone ack. + * 7. B responds with message 4, which should carry a piggyback ack for message 3 + * (this is the part that needs testing!) + * 8. A sends message 5 to B. + */ + // Test Suite /** @@ -1280,12 +1414,13 @@ const nlTest sTests[] = NL_TEST_DEF("Test ReliableMessageMgr::CheckDuplicateMessage", CheckDuplicateMessage), NL_TEST_DEF("Test ReliableMessageMgr::CheckDuplicateMessageClosedExchange", CheckDuplicateMessageClosedExchange), NL_TEST_DEF("Test that a reply after a standalone ack comes through correctly", CheckReceiveAfterStandaloneAck), - NL_TEST_DEF("Test that a reply to a non-MRP message does not piggyback an ack even if there were MRP things happening on the context before", CheckNoPiggybackAfterPiggyback), + NL_TEST_DEF("Test that a reply to a non-MRP message piggybacks an ack if there were MRP things happening on the context before", CheckPiggybackAfterPiggyback), NL_TEST_DEF("Test sending an unsolicited ack-soliciting 'standalone ack' message", CheckSendUnsolicitedStandaloneAckMessage), NL_TEST_DEF("Test ReliableMessageMgr::CheckSendStandaloneAckMessage", CheckSendStandaloneAckMessage), NL_TEST_DEF("Test command, response, default response, with receiver closing exchange after sending response", CheckMessageAfterClosed), NL_TEST_DEF("Test that unencrypted message is dropped if exchange requires encryption", CheckUnencryptedMessageReceiveFailure), NL_TEST_DEF("Test that dropping an application-level message with a piggyback ack works ok once both sides retransmit", CheckLostResponseWithPiggyback), + NL_TEST_DEF("Test that an application-level response-to-response after a lost standalone ack to the initial message works", CheckLostStandaloneAck), NL_TEST_SENTINEL() };