diff --git a/docs/component-status.md b/docs/component-status.md index bcc1a73f013..894f1eac2c0 100644 --- a/docs/component-status.md +++ b/docs/component-status.md @@ -36,7 +36,7 @@ There is a finite state machine underlying the status reporting API that governs ![State Diagram](img/component-status-state-diagram.png) -The finite state machine ensures that components progress through the lifecycle properly and it manages transitions through runtime states so that components do not need to track their state internally. Only changes in status result in new events being generated; repeat reports of the same status are ignored. PermanentError and FatalError are permanent runtime states. A component in these states cannot make any further state transitions. +The finite state machine ensures that components progress through the lifecycle properly and it manages transitions through runtime states so that components do not need to track their state internally. Only changes in status result in new events being generated; repeat reports of the same status are ignored. PermanentError is a permanent runtime state. A component in a PermanentError state cannot transtion to OK or RecoverableError, but it can transition to Stopping. FatalError is a final state. A component in a FatalError state cannot make any further state transitions. ![Status Event Generation](img/component-status-event-generation.png) @@ -61,6 +61,7 @@ Under most circumstances, a component does not need to report explicit status du **Runtime** ![Runtime State Diagram](img/component-status-runtime-states.png) + During runtime a component should not have to keep track of its state. A component should report status as operations succeed or fail and the finite state machine will handle the rest. Changes in status will result in new status events being emitted. Repeat reports of the same status will no-op. Similarly, attempts to make an invalid state transition, such as PermanentError to OK, will have no effect. We intend to define guidelines to help component authors distinguish between recoverable and permanent errors on a per-component type basis and we'll update this document as we make decisions. See [this issue](https://github.com/open-telemetry/opentelemetry-collector/issues/9957) for current thoughts and discussions. diff --git a/docs/img/component-status-state-diagram.png b/docs/img/component-status-state-diagram.png index ad9735edfe3..20911f7ae22 100644 Binary files a/docs/img/component-status-state-diagram.png and b/docs/img/component-status-state-diagram.png differ diff --git a/service/internal/graph/lifecycle_test.go b/service/internal/graph/lifecycle_test.go index 9fb91f5af6f..07db29df3e4 100644 --- a/service/internal/graph/lifecycle_test.go +++ b/service/internal/graph/lifecycle_test.go @@ -351,6 +351,8 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) { instanceIDs[eStErr]: { componentstatus.NewEvent(componentstatus.StatusStarting), componentstatus.NewPermanentErrorEvent(assert.AnError), + componentstatus.NewEvent(componentstatus.StatusStopping), + componentstatus.NewEvent(componentstatus.StatusStopped), }, }, startupErr: assert.AnError, @@ -362,6 +364,8 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) { instanceIDs[rStErr]: { componentstatus.NewEvent(componentstatus.StatusStarting), componentstatus.NewPermanentErrorEvent(assert.AnError), + componentstatus.NewEvent(componentstatus.StatusStopping), + componentstatus.NewEvent(componentstatus.StatusStopped), }, instanceIDs[eNoErr]: { componentstatus.NewEvent(componentstatus.StatusStarting), diff --git a/service/internal/status/status.go b/service/internal/status/status.go index a27bff618dd..5a44a02665f 100644 --- a/service/internal/status/status.go +++ b/service/internal/status/status.go @@ -70,8 +70,10 @@ func newFSM(onTransition onTransitionFunc) *fsm { componentstatus.StatusFatalError: {}, componentstatus.StatusStopping: {}, }, - componentstatus.StatusPermanentError: {}, - componentstatus.StatusFatalError: {}, + componentstatus.StatusPermanentError: { + componentstatus.StatusStopping: {}, + }, + componentstatus.StatusFatalError: {}, componentstatus.StatusStopping: { componentstatus.StatusRecoverableError: {}, componentstatus.StatusPermanentError: {}, diff --git a/service/internal/status/status_test.go b/service/internal/status/status_test.go index 6a61b863a26..d744e1aa694 100644 --- a/service/internal/status/status_test.go +++ b/service/internal/status/status_test.go @@ -75,17 +75,19 @@ func TestStatusFSM(t *testing.T) { expectedErrorCount: 2, }, { - name: "PermanentError is terminal", + name: "PermanentError is stoppable", reportedStatuses: []componentstatus.Status{ componentstatus.StatusStarting, componentstatus.StatusOK, componentstatus.StatusPermanentError, componentstatus.StatusOK, + componentstatus.StatusStopping, }, expectedStatuses: []componentstatus.Status{ componentstatus.StatusStarting, componentstatus.StatusOK, componentstatus.StatusPermanentError, + componentstatus.StatusStopping, }, expectedErrorCount: 1, }, @@ -154,7 +156,7 @@ func TestValidSeqsToStopped(t *testing.T) { } for _, ev := range events { - name := fmt.Sprintf("transition from: %s to: %s invalid", ev.Status(), componentstatus.StatusStopped) + name := fmt.Sprintf("transition from: %s to: %s", ev.Status(), componentstatus.StatusStopped) t.Run(name, func(t *testing.T) { fsm := newFSM(func(*componentstatus.Event) {}) if ev.Status() != componentstatus.StatusStarting { @@ -165,9 +167,9 @@ func TestValidSeqsToStopped(t *testing.T) { err := fsm.transition(componentstatus.NewEvent(componentstatus.StatusStopped)) require.ErrorIs(t, err, errInvalidStateTransition) - // stopping -> stopped is allowed for non-fatal, non-permanent errors + // stopping -> stopped is allowed for non-fatal errors err = fsm.transition(componentstatus.NewEvent(componentstatus.StatusStopping)) - if ev.Status() == componentstatus.StatusPermanentError || ev.Status() == componentstatus.StatusFatalError { + if ev.Status() == componentstatus.StatusFatalError { require.ErrorIs(t, err, errInvalidStateTransition) } else { require.NoError(t, err)