From 34014621b59c044e6b2593de4ad59b3f04417740 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Fri, 7 Apr 2023 18:02:11 -0400 Subject: [PATCH] Add operational resolution retries in OperationalSessionSetup. (#25991) Apparently some border routers are broken and don't actually do the "multicast" part of mDNS right, leading to resolve failures if the advertisement is not available during the initial unicast-response probe. Work around that by retrying operational discovery a few times during commissioning. --- src/app/OperationalSessionSetup.cpp | 59 ++++++++++++++++++++++++++--- src/app/OperationalSessionSetup.h | 8 ++++ 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 2e9d572a18618e..f0cbece56cffae 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -442,6 +442,10 @@ CHIP_ERROR OperationalSessionSetup::LookupPeerAddress() { ++mAttemptsDone; } + if (mResolveAttemptsAllowed > 0) + { + --mResolveAttemptsAllowed; + } #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES // NOTE: This is public API that can be used to update our stored peer @@ -502,9 +506,44 @@ void OperationalSessionSetup::OnNodeAddressResolutionFailed(const PeerId & peerI ChipLogError(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: operational discovery failed: %" CHIP_ERROR_FORMAT, mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), reason.Format()); - // Does it make sense to ScheduleSessionSetupReattempt() here? DNS-SD - // resolution has its own retry/backoff mechanisms, so if it's failed we - // have already done a lot of that. +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // If we're in a mode where we would generally retry CASE, retry operational + // discovery once. That allows us to more-gracefully handle broken networks + // where multicast DNS does not actually work and hence only the initial + // unicast DNS-SD queries get a response. + // + // We check for State::ResolvingAddress just in case in the meantime + // something weird happened and we are no longer trying to resolve an + // address. + if (mState == State::ResolvingAddress && mResolveAttemptsAllowed > 0) + { + ChipLogProgress(Discovery, "Retrying operational DNS-SD discovery. Attempts remaining: %u", mResolveAttemptsAllowed); + + // Pretend like our previous attempt (i.e. call to LookupPeerAddress) + // has not happened for purposes of the generic attempt counters, so we + // don't mess up the counters for our actual CASE retry logic. + if (mRemainingAttempts < UINT8_MAX) + { + ++mRemainingAttempts; + } + if (mAttemptsDone > 0) + { + --mAttemptsDone; + } + + CHIP_ERROR err = LookupPeerAddress(); + if (err == CHIP_NO_ERROR) + { + // We need to notify our consumer that the resolve will take more + // time, but we don't actually know how much time it will take, + // because the resolver does not expose that information. Just use + // one minute to be safe. + using namespace chip::System::Clock::Literals; + NotifyRetryHandlers(reason, 60_s16); + return; + } + } +#endif // No need to modify any variables in `this` since call below releases `this`. DequeueConnectionCallbacks(reason); @@ -531,6 +570,11 @@ void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount) { mRemainingAttempts = attemptCount; } + + if (attemptCount > mResolveAttemptsAllowed) + { + mResolveAttemptsAllowed = attemptCount; + } } CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay) @@ -619,11 +663,16 @@ void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, const Reliab System::Clock::Timeout messageTimeout = CASESession::ComputeSigma1ResponseTimeout(remoteMrpConfig); auto timeoutSecs = std::chrono::duration_cast(messageTimeout); // Add 1 second in case we had fractional milliseconds in messageTimeout. - timeoutSecs += System::Clock::Seconds16(1); + using namespace chip::System::Clock::Literals; + NotifyRetryHandlers(error, timeoutSecs + 1_s16 + retryDelay); +} + +void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate) +{ for (auto * item = mConnectionRetry.First(); item && item != &mConnectionRetry; item = item->mNext) { auto cb = Callback::Callback::FromCancelable(item); - cb->mCall(cb->mContext, mPeerId, error, timeoutSecs + retryDelay); + cb->mCall(cb->mContext, mPeerId, error, timeoutEstimate); } } #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index 60f0f0ecaba5b7..e7522d5d62e978 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -286,6 +286,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, uint8_t mRemainingAttempts = 0; uint8_t mAttemptsDone = 0; + uint8_t mResolveAttemptsAllowed = 0; + Callback::CallbackDeque mConnectionRetry; #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES @@ -354,6 +356,12 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, */ void NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig, System::Clock::Seconds16 retryDelay); + + /** + * A version of NotifyRetryHandlers that passes in a retry timeout estimate + * directly. + */ + void NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate); #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES };