From 3fea0ece26addef4517d543e84080eb555a2b99c Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Wed, 5 Apr 2023 16:48:35 -0400 Subject: [PATCH] Add operational resolution retries in OperationalSessionSetup. Apparently some border routers are broken and don't actually do the "multicast" part of mDNS right, leading to resolve failures if the advertisement is not available during the initial unicast-response probe. Work around that by retrying operational discovery a few times during commissioning. --- src/app/OperationalSessionSetup.cpp | 53 ++++++++++++++++++++++++++--- src/app/OperationalSessionSetup.h | 8 +++++ 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 2e9d572a18618e..ff5a523b2bda58 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -442,6 +442,9 @@ CHIP_ERROR OperationalSessionSetup::LookupPeerAddress() { ++mAttemptsDone; } + if (mResolveAttemptsAllowed > 0) { + --mResolveAttemptsAllowed; + } #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES // NOTE: This is public API that can be used to update our stored peer @@ -502,9 +505,40 @@ void OperationalSessionSetup::OnNodeAddressResolutionFailed(const PeerId & peerI ChipLogError(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: operational discovery failed: %" CHIP_ERROR_FORMAT, mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), reason.Format()); - // Does it make sense to ScheduleSessionSetupReattempt() here? DNS-SD - // resolution has its own retry/backoff mechanisms, so if it's failed we - // have already done a lot of that. +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // If we're in a mode where we would generally retry CASE, retry operational + // discovery once. That allows us to more-gracefully handle broken networks + // where multicast DNS does not actually work and hence only the initial + // unicast DNS-SD queries get a response. + // + // We check for State::ResolvingAddress just in case in the meantime + // something weird happened and we are no longer trying to resolve an + // address. + if (mState == State::ResolvingAddress && mResolveAttemptsAllowed > 0) { + ChipLogProgress(Discovery, "Retrying operational DNS-SD discovery. Attempts remaining: %u", mResolveAttemptsAllowed); + + // Pretend like our previous attempt (i.e. call to LookupPeerAddress) + // has not happened for purposes of the generic attempt counters, so we + // don't mess up the counters for our actual CASE retry logic. + if (mRemainingAttempts < UINT8_MAX) { + ++mRemainingAttempts; + } + if (mAttemptsDone > 0) { + --mAttemptsDone; + } + + CHIP_ERROR err = LookupPeerAddress(); + if (err == CHIP_NO_ERROR) { + // We need to notify our consumer that the resolve will take more + // time, but we don't actually know how much time it will take, + // because the resolver does not expose that information. Just use + // one minute to be safe. + using namespace chip::System::Clock::Literals; + NotifyRetryHandlers(reason, 60_s16); + return; + } + } +#endif // No need to modify any variables in `this` since call below releases `this`. DequeueConnectionCallbacks(reason); @@ -531,6 +565,10 @@ void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount) { mRemainingAttempts = attemptCount; } + + if (attemptCount > mResolveAttemptsAllowed) { + mResolveAttemptsAllowed = attemptCount; + } } CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay) @@ -619,11 +657,16 @@ void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, const Reliab System::Clock::Timeout messageTimeout = CASESession::ComputeSigma1ResponseTimeout(remoteMrpConfig); auto timeoutSecs = std::chrono::duration_cast(messageTimeout); // Add 1 second in case we had fractional milliseconds in messageTimeout. - timeoutSecs += System::Clock::Seconds16(1); + using namespace chip::System::Clock::Literals; + NotifyRetryHandlers(error, timeoutSecs + 1_s16 + retryDelay); +} + +void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate) +{ for (auto * item = mConnectionRetry.First(); item && item != &mConnectionRetry; item = item->mNext) { auto cb = Callback::Callback::FromCancelable(item); - cb->mCall(cb->mContext, mPeerId, error, timeoutSecs + retryDelay); + cb->mCall(cb->mContext, mPeerId, error, timeoutEstimate); } } #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index 60f0f0ecaba5b7..e7522d5d62e978 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -286,6 +286,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, uint8_t mRemainingAttempts = 0; uint8_t mAttemptsDone = 0; + uint8_t mResolveAttemptsAllowed = 0; + Callback::CallbackDeque mConnectionRetry; #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES @@ -354,6 +356,12 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, */ void NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig, System::Clock::Seconds16 retryDelay); + + /** + * A version of NotifyRetryHandlers that passes in a retry timeout estimate + * directly. + */ + void NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate); #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES };