Skip to content

Commit

Permalink
Properly handle commissioning error paths (#16882)
Browse files Browse the repository at this point in the history
* Don't re-establish PASE if we have a connection.

* Fix incorrect error return for pase with no pairing delegate

* Add commissioning status update callback

Also callback with full peer id for folks that want that.

* Disarm failsafe immediately if we get 0 timeout.

* Fix event handling in commissioning window manager.

The current failsafe expiry doesn't actually re-start the advertising
because the pase connection stops the commissioning window timer.

Instead, stop the commissining window timer when we get a commissioning
complete.

Don't use the OnSessionEstablishmentError function for failsafe
timer expiry because it sends a callback about pairing that does not
apply here.

* Fail re-send if the session is expired.

* Disarm failsafe and kill pase connection for early failures

* Clear event on BLE commissioning.

* Restyled by clang-format

* Restyled by autopep8

* Add ExpireAllPASEPairings function to session manager

Use in CommissioningWindowManager

* Restyled by clang-format

* Address review comments

* Boris' reliable transmission fix.

* Revert back to using > 0 for breadcrumb.

Most commands don't set it - the last one that did was the failsafe.

* manually cancel timer.

* Apply suggestions from code review

Co-authored-by: Boris Zbarsky <[email protected]>

* Add API documentation

* Move log line down.

* Restyled by clang-format

* Boris' fix for darwin tests.

Co-authored-by: Restyled.io <[email protected]>
Co-authored-by: Boris Zbarsky <[email protected]>
  • Loading branch information
3 people authored Apr 4, 2022
1 parent 0ba8ff3 commit c3b1810
Show file tree
Hide file tree
Showing 28 changed files with 969 additions and 46 deletions.
2 changes: 2 additions & 0 deletions scripts/tests/cirque_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ CIRQUE_TESTS=(
"CommissioningTest"
"InteractionModelTest"
"SplitCommissioningTest"
"CommissioningFailureTest"
"CommissioningFailureOnReportTest"
)

BOLD_GREEN_TEXT="\033[1;32m"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,17 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
if (!failSafeContext.IsFailSafeBusy() &&
(!failSafeContext.IsFailSafeArmed() || failSafeContext.MatchesFabricIndex(accessingFabricIndex)))
{
CheckSuccess(failSafeContext.ArmFailSafe(accessingFabricIndex, System::Clock::Seconds16(commandData.expiryLengthSeconds)),
Failure);
if (commandData.expiryLengthSeconds == 0)
{
// Force the timer to expire immediately.
failSafeContext.ForceFailSafeTimerExpiry();
}
else
{
CheckSuccess(
failSafeContext.ArmFailSafe(accessingFabricIndex, System::Clock::Seconds16(commandData.expiryLengthSeconds)),
Failure);
}
response.errorCode = CommissioningError::kOk;
commandObj->AddResponse(commandPath, response);
}
Expand Down
13 changes: 9 additions & 4 deletions src/app/server/CommissioningWindowManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,15 @@ void CommissioningWindowManager::OnPlatformEvent(const DeviceLayer::ChipDeviceEv
if (event->Type == DeviceLayer::DeviceEventType::kCommissioningComplete)
{
ChipLogProgress(AppServer, "Commissioning completed successfully");
DeviceLayer::SystemLayer().CancelTimer(HandleCommissioningWindowTimeout, this);
mCommissioningTimeoutTimerArmed = false;
Cleanup();
mServer->GetSecureSessionManager().ExpireAllPASEPairings();
}
else if (event->Type == DeviceLayer::DeviceEventType::kFailSafeTimerExpired)
{
ChipLogError(AppServer, "Failsafe timer expired");
OnSessionEstablishmentError(CHIP_ERROR_TIMEOUT);
HandleFailedAttempt(CHIP_ERROR_TIMEOUT);
}
else if (event->Type == DeviceLayer::DeviceEventType::kOperationalNetworkEnabled)
{
Expand Down Expand Up @@ -98,9 +101,13 @@ void CommissioningWindowManager::Cleanup()
void CommissioningWindowManager::OnSessionEstablishmentError(CHIP_ERROR err)
{
DeviceLayer::SystemLayer().CancelTimer(HandleSessionEstablishmentTimeout, this);
HandleFailedAttempt(err);
}

void CommissioningWindowManager::HandleFailedAttempt(CHIP_ERROR err)
{
mFailedCommissioningAttempts++;
ChipLogError(AppServer, "Commissioning failed (attempt %d): %s", mFailedCommissioningAttempts, ErrorStr(err));

#if CONFIG_NETWORK_LAYER_BLE
mServer->GetBleLayerObject()->CloseAllBleConnections();
#endif
Expand Down Expand Up @@ -134,8 +141,6 @@ void CommissioningWindowManager::OnSessionEstablishmentStarted()
void CommissioningWindowManager::OnSessionEstablished()
{
DeviceLayer::SystemLayer().CancelTimer(HandleSessionEstablishmentTimeout, this);
DeviceLayer::SystemLayer().CancelTimer(HandleCommissioningWindowTimeout, this);
mCommissioningTimeoutTimerArmed = false;
SessionHolder sessionHolder;
CHIP_ERROR err = mServer->GetSecureSessionManager().NewPairing(
sessionHolder, Optional<Transport::PeerAddress>::Value(mPairingSession.GetPeerAddress()), mPairingSession.GetPeerNodeId(),
Expand Down
5 changes: 5 additions & 0 deletions src/app/server/CommissioningWindowManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ class CommissioningWindowManager : public SessionEstablishmentDelegate, public a
// called when a commissioning window timeout timer is running.
CHIP_ERROR AdvertiseAndListenForPASE();

// Call AdvertiseAndListenForPASE, only if max attempts have not been reached.
// Cleans up and calls app server delegate on failure.
// err gives the current error we're attemping to recover from
void HandleFailedAttempt(CHIP_ERROR err);

// Helper for Shutdown and Cleanup. Does not do anything with
// advertisements, because Shutdown and Cleanup want to handle those
// differently.
Expand Down
40 changes: 27 additions & 13 deletions src/controller/AutoCommissioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ CommissioningStage AutoCommissioner::GetNextCommissioningStageInternal(Commissio
case CommissioningStage::kSecurePairing:
return CommissioningStage::kReadCommissioningInfo;
case CommissioningStage::kReadCommissioningInfo:
if (mDeviceCommissioningInfo.general.breadcrumb > 0)
{
// If the breadcrumb is 0, the failsafe was disarmed.
// We failed on network setup or later, the node failsafe has not been re-armed and the breadcrumb has not been reset.
// Per the spec, we restart from after adding the NOC.
return GetNextCommissioningStage(CommissioningStage::kSendNOC, lastErr);
}
return CommissioningStage::kArmFailsafe;
case CommissioningStage::kArmFailsafe:
return CommissioningStage::kConfigRegulatory;
Expand Down Expand Up @@ -314,25 +321,33 @@ CHIP_ERROR AutoCommissioner::NOCChainGenerated(ByteSpan noc, ByteSpan icac, Byte

CHIP_ERROR AutoCommissioner::CommissioningStepFinished(CHIP_ERROR err, CommissioningDelegate::CommissioningReport report)
{
CompletionStatus completionStatus;
completionStatus.err = err;
ChipLogProgress(Controller, "Finished commissioning step '%s' with error '%s'", StageToString(report.stageCompleted),
err.AsString());
if (err != CHIP_NO_ERROR)
{
completionStatus.failedStage = MakeOptional(report.stageCompleted);
ChipLogError(Controller, "Failed to perform commissioning step %d", static_cast<int>(report.stageCompleted));
if ((report.stageCompleted == CommissioningStage::kAttestationVerification) &&
((report.Get<AdditionalErrorInfo>().attestationResult ==
Credentials::AttestationVerificationResult::kDacProductIdMismatch) ||
(report.Get<AdditionalErrorInfo>().attestationResult ==
Credentials::AttestationVerificationResult::kDacVendorIdMismatch)))
if (report.stageCompleted == CommissioningStage::kAttestationVerification)
{
ChipLogError(Controller,
"Failed device attestation. Device vendor and/or product ID do not match the IDs expected. "
"Verify DAC certificate chain and certification declaration to ensure spec rules followed.");
if (report.Is<AdditionalErrorInfo>())
{
completionStatus.attestationResult = MakeOptional(report.Get<AdditionalErrorInfo>().attestationResult);
if ((report.Get<AdditionalErrorInfo>().attestationResult ==
Credentials::AttestationVerificationResult::kDacProductIdMismatch) ||
(report.Get<AdditionalErrorInfo>().attestationResult ==
Credentials::AttestationVerificationResult::kDacVendorIdMismatch))
{
ChipLogError(Controller,
"Failed device attestation. Device vendor and/or product ID do not match the IDs expected. "
"Verify DAC certificate chain and certification declaration to ensure spec rules followed.");
}
}
}
}
else
{
ChipLogProgress(Controller, "Finished commissioning step '%s' with error '%s'", StageToString(report.stageCompleted),
err.AsString());

switch (report.stageCompleted)
{
case CommissioningStage::kReadCommissioningInfo:
Expand Down Expand Up @@ -385,7 +400,6 @@ CHIP_ERROR AutoCommissioner::CommissioningStepFinished(CHIP_ERROR err, Commissio
ReleaseDAC();
mCommissioneeDeviceProxy = nullptr;
mOperationalDeviceProxy = nullptr;
mParams = CommissioningParameters();
mDeviceCommissioningInfo = ReadCommissioningInfo();
return CHIP_NO_ERROR;
default:
Expand All @@ -412,7 +426,7 @@ CHIP_ERROR AutoCommissioner::CommissioningStepFinished(CHIP_ERROR err, Commissio
return CHIP_ERROR_INCORRECT_STATE;
}

mParams.SetCompletionStatus(err);
mParams.SetCompletionStatus(completionStatus);
mCommissioner->PerformCommissioningStep(proxy, nextStage, mParams, this, GetEndpoint(nextStage), GetCommandTimeout(nextStage));
return CHIP_NO_ERROR;
}
Expand Down
7 changes: 5 additions & 2 deletions src/controller/AutoCommissioner.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ class AutoCommissioner : public CommissioningDelegate

CHIP_ERROR CommissioningStepFinished(CHIP_ERROR err, CommissioningDelegate::CommissioningReport report) override;

private:
protected:
CommissioningStage GetNextCommissioningStage(CommissioningStage currentStage, CHIP_ERROR & lastErr);
CommissioningStage GetNextCommissioningStageInternal(CommissioningStage currentStage, CHIP_ERROR & lastErr);
DeviceCommissioner * GetCommissioner() { return mCommissioner; }

private:
void ReleaseDAC();
void ReleasePAI();

Expand All @@ -52,6 +54,7 @@ class AutoCommissioner : public CommissioningDelegate
CHIP_ERROR NOCChainGenerated(ByteSpan noc, ByteSpan icac, ByteSpan rcac, AesCcm128KeySpan ipk, NodeId adminSubject);
Optional<System::Clock::Timeout> GetCommandTimeout(CommissioningStage stage) const;
EndpointId GetEndpoint(const CommissioningStage & stage) const;
CommissioningStage GetNextCommissioningStageInternal(CommissioningStage currentStage, CHIP_ERROR & lastErr);

DeviceCommissioner * mCommissioner = nullptr;
CommissioneeDeviceProxy * mCommissioneeDeviceProxy = nullptr;
Expand Down
147 changes: 135 additions & 12 deletions src/controller/CHIPDeviceController.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,21 @@ CommissioneeDeviceProxy * DeviceCommissioner::FindCommissioneeDevice(NodeId id)
return foundDevice;
}

CommissioneeDeviceProxy * DeviceCommissioner::FindCommissioneeDevice(const Transport::PeerAddress & peerAddress)
{
CommissioneeDeviceProxy * foundDevice = nullptr;
mCommissioneeDevicePool.ForEachActiveObject([&](auto * deviceProxy) {
if (deviceProxy->GetPeerAddress() == peerAddress)
{
foundDevice = deviceProxy;
return Loop::Break;
}
return Loop::Continue;
});

return foundDevice;
}

void DeviceCommissioner::ReleaseCommissioneeDevice(CommissioneeDeviceProxy * device)
{
mCommissioneeDevicePool.ReleaseObject(device);
Expand Down Expand Up @@ -605,6 +620,7 @@ CHIP_ERROR DeviceCommissioner::EstablishPASEConnection(NodeId remoteDeviceId, Re
MATTER_TRACE_EVENT_SCOPE("EstablishPASEConnection", "DeviceCommissioner");
CHIP_ERROR err = CHIP_NO_ERROR;
CommissioneeDeviceProxy * device = nullptr;
CommissioneeDeviceProxy * current = nullptr;
Transport::PeerAddress peerAddress = Transport::PeerAddress::UDP(Inet::IPAddress::Any);

Messaging::ExchangeContext * exchangeCtxt = nullptr;
Expand Down Expand Up @@ -640,6 +656,33 @@ CHIP_ERROR DeviceCommissioner::EstablishPASEConnection(NodeId remoteDeviceId, Re
params.GetPeerAddress().GetInterface());
}

current = FindCommissioneeDevice(peerAddress);
if (current != nullptr)
{

if (current->IsSecureConnected())
{
if (mPairingDelegate)
{
// We already have an open secure session to this device, call the callback immediately and early return.
mPairingDelegate->OnPairingComplete(CHIP_NO_ERROR);
}
return CHIP_NO_ERROR;
}
else if (current->IsSessionSetupInProgress())
{
// We're not connected yet, but we're in the process of connecting. Pairing delegate will get a callback when connection
// completes
return CHIP_NO_ERROR;
}
else
{
// Something has gone strange. Delete the old device, try again.
ChipLogError(Controller, "Found unconnected device, removing");
ReleaseCommissioneeDevice(current);
}
}

device = mCommissioneeDevicePool.CreateObject();
VerifyOrExit(device != nullptr, err = CHIP_ERROR_NO_MEMORY);

Expand Down Expand Up @@ -1314,13 +1357,97 @@ void OnBasicFailure(void * context, CHIP_ERROR error)
commissioner->CommissioningStageComplete(error);
}

void DeviceCommissioner::CleanupCommissioning(DeviceProxy * proxy, NodeId nodeId, const CompletionStatus & completionStatus)
{
commissioningCompletionStatus = completionStatus;
if (completionStatus.err == CHIP_NO_ERROR ||
(completionStatus.failedStage.HasValue() && completionStatus.failedStage.Value() >= kWiFiNetworkSetup))
{
// If we Completed successfully, send the callbacks, we're done.
// If we were already doing network setup, we need to retain the pase session and start again from network setup stage.
// We do not need to reset the failsafe here because we want to keep everything on the device up to this point, so just
// send the completion callbacks.
CommissioningStageComplete(CHIP_NO_ERROR);
SendCommissioningCompleteCallbacks(nodeId, commissioningCompletionStatus);
}
else
{
// If we've failed somewhere in the early stages (or we don't have a failedStage specified), we need to start from the
// beginning. However, because some of the commands can only be sent once per arm-failsafe, we also need to force a reset on
// the failsafe so we can start fresh on the next attempt.
GeneralCommissioning::Commands::ArmFailSafe::Type request;
request.expiryLengthSeconds = 0; // Expire immediately.
request.breadcrumb = 0;
ChipLogProgress(Controller, "Expiring failsafe on proxy %p", proxy);
mDeviceBeingCommissioned = proxy;
// We actually want to do the same thing on success or failure because we're already in a failure state
SendCommand<GeneralCommissioningCluster>(proxy, request, OnDisarmFailsafe, OnDisarmFailsafeFailure);
}
}

void DeviceCommissioner::OnDisarmFailsafe(void * context,
const GeneralCommissioning::Commands::ArmFailSafeResponse::DecodableType & data)
{
ChipLogProgress(Controller, "Failsafe disarmed");
DeviceCommissioner * commissioner = static_cast<DeviceCommissioner *>(context);
commissioner->DisarmDone();
}

void DeviceCommissioner::OnDisarmFailsafeFailure(void * context, CHIP_ERROR error)
{
ChipLogProgress(Controller, "Received failure response when disarming failsafe%s\n", chip::ErrorStr(error));
DeviceCommissioner * commissioner = static_cast<DeviceCommissioner *>(context);
commissioner->DisarmDone();
}

void DeviceCommissioner::DisarmDone()
{
// At this point, we also want to close off the pase session so we need to re-establish
CommissioneeDeviceProxy * commissionee = FindCommissioneeDevice(mDeviceBeingCommissioned->GetDeviceId());

// Signal completion - this will reset mDeviceBeingCommissioned.
CommissioningStageComplete(CHIP_NO_ERROR);
SendCommissioningCompleteCallbacks(commissionee->GetDeviceId(), commissioningCompletionStatus);

// If we've disarmed the failsafe, it's because we're starting again, so kill the pase connection.
if (commissionee != nullptr)
{
ReleaseCommissioneeDevice(commissionee);
}
}

void DeviceCommissioner::SendCommissioningCompleteCallbacks(NodeId nodeId, const CompletionStatus & completionStatus)
{
mCommissioningStage = CommissioningStage::kSecurePairing;
if (mPairingDelegate == nullptr)
{
return;
}
mPairingDelegate->OnCommissioningComplete(nodeId, completionStatus.err);
PeerId peerId(GetCompressedFabricId(), nodeId);
if (completionStatus.err == CHIP_NO_ERROR)
{
mPairingDelegate->OnCommissioningSuccess(peerId);
}
else
{
mPairingDelegate->OnCommissioningFailure(peerId, completionStatus.err, completionStatus.failedStage.ValueOr(kError),
completionStatus.attestationResult);
}
}

void DeviceCommissioner::CommissioningStageComplete(CHIP_ERROR err, CommissioningDelegate::CommissioningReport report)
{
// Once this stage is complete, reset mDeviceBeingCommissioned - this will be reset when the delegate calls the next step.
MATTER_TRACE_EVENT_SCOPE("CommissioningStageComplete", "DeviceCommissioner");
NodeId nodeId = mDeviceBeingCommissioned->GetDeviceId();
DeviceProxy * proxy = mDeviceBeingCommissioned;
mDeviceBeingCommissioned = nullptr;

if (mPairingDelegate != nullptr)
{
mPairingDelegate->OnCommissioningStatusUpdate(PeerId(GetCompressedFabricId(), nodeId), mCommissioningStage, err);
}
if (mCommissioningDelegate == nullptr)
{
return;
Expand All @@ -1330,11 +1457,12 @@ void DeviceCommissioner::CommissioningStageComplete(CHIP_ERROR err, Commissionin
if (status != CHIP_NO_ERROR)
{
// Commissioning delegate will only return error if it failed to perform the appropriate commissioning step.
// In this case, we should call back the commissioning complete and call session error
if (mPairingDelegate != nullptr && mDeviceBeingCommissioned != nullptr)
{
mPairingDelegate->OnCommissioningComplete(nodeId, status);
}
// In this case, we should complete the commissioning for it.
CompletionStatus completionStatus;
completionStatus.err = status;
completionStatus.failedStage = MakeOptional(report.stageCompleted);
mCommissioningStage = CommissioningStage::kCleanup;
CleanupCommissioning(proxy, nodeId, completionStatus);
}
}

Expand Down Expand Up @@ -1604,7 +1732,7 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio
Optional<System::Clock::Timeout> timeout)
{
ChipLogProgress(Controller, "Performing next commissioning step '%s' with completion status = '%s'", StageToString(step),
params.GetCompletionStatus().AsString());
params.GetCompletionStatus().err.AsString());

// For now, we ignore errors coming in from the device since not all commissioning clusters are implemented on the device
// side.
Expand Down Expand Up @@ -1943,12 +2071,7 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio
}
break;
case CommissioningStage::kCleanup:
if (mPairingDelegate != nullptr)
{
mPairingDelegate->OnCommissioningComplete(proxy->GetDeviceId(), params.GetCompletionStatus());
}
CommissioningStageComplete(CHIP_NO_ERROR);
mCommissioningStage = CommissioningStage::kSecurePairing;
CleanupCommissioning(proxy, proxy->GetDeviceId(), params.GetCompletionStatus());
break;
case CommissioningStage::kError:
mCommissioningStage = CommissioningStage::kSecurePairing;
Expand Down
Loading

0 comments on commit c3b1810

Please sign in to comment.