diff --git a/lib/autoupdate/rollout/client.go b/lib/autoupdate/rollout/client.go
index 16b9f5197fb2b..bde2267d095de 100644
--- a/lib/autoupdate/rollout/client.go
+++ b/lib/autoupdate/rollout/client.go
@@ -22,6 +22,7 @@ import (
     "context"
 
     autoupdatepb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
+    "github.com/gravitational/teleport/api/types"
 )
 
 // Client is the subset of the Teleport client RPCs the controller needs.
@@ -43,4 +44,7 @@ type Client interface {
 
     // DeleteAutoUpdateAgentRollout deletes the AutoUpdateAgentRollout singleton resource.
     DeleteAutoUpdateAgentRollout(ctx context.Context) error
+
+    // GetClusterMaintenanceConfig loads the current maintenance config singleton.
+    GetClusterMaintenanceConfig(ctx context.Context) (types.ClusterMaintenanceConfig, error)
 }
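
Note: types.ClusterMaintenanceConfig is an interface over the legacy gogo proto-based resource, so callers read the upgrade window through an accessor rather than a struct field. A minimal sketch of how a consumer of the new RPC might distinguish the three possible outcomes; clt is any implementation of the Client interface above, and the handling shown is illustrative, not part of this diff:

    cmc, err := clt.GetClusterMaintenanceConfig(ctx)
    switch {
    case trace.IsNotFound(err):
        // No CMC singleton exists: fall back to defaults.
    case err != nil:
        // Any other error should be surfaced to the caller.
    default:
        if window, ok := cmc.GetAgentUpgradeWindow(); ok {
            _ = window.UTCStartHour // hour of day, in UTC
        }
    }
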
diff --git a/lib/autoupdate/rollout/client_test.go b/lib/autoupdate/rollout/client_test.go
index fc0e49bd04763..782251a562025 100644
--- a/lib/autoupdate/rollout/client_test.go
+++ b/lib/autoupdate/rollout/client_test.go
@@ -24,8 +24,11 @@ import (
 
     "github.com/stretchr/testify/require"
    "google.golang.org/protobuf/proto"
+    "google.golang.org/protobuf/protoadapt"
 
    "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
+    "github.com/gravitational/teleport/api/types"
+    apiutils "github.com/gravitational/teleport/api/utils"
 )
 
 // mockClient is a mock implementation of the Client interface for testing purposes.
@@ -39,6 +42,7 @@ type mockClient struct {
     createAutoUpdateAgentRollout *createUpdateHandler[*autoupdate.AutoUpdateAgentRollout]
     updateAutoUpdateAgentRollout *createUpdateHandler[*autoupdate.AutoUpdateAgentRollout]
     deleteAutoUpdateAgentRollout *deleteHandler
+    getClusterMaintenanceConfig  *legacyGetHandler[*types.ClusterMaintenanceConfigV1]
 }
 
 func (m mockClient) GetAutoUpdateConfig(ctx context.Context) (*autoupdate.AutoUpdateConfig, error) {
@@ -65,6 +69,10 @@ func (m mockClient) DeleteAutoUpdateAgentRollout(ctx context.Context) error {
     return m.deleteAutoUpdateAgentRollout.handle(ctx)
 }
 
+func (m mockClient) GetClusterMaintenanceConfig(ctx context.Context) (types.ClusterMaintenanceConfig, error) {
+    return m.getClusterMaintenanceConfig.handle(ctx)
+}
+
 func (m mockClient) checkIfEmpty(t *testing.T) {
     require.True(t, m.getAutoUpdateConfig.isEmpty(), "Get autoupdate_config mock not empty")
     require.True(t, m.getAutoUpdateVersion.isEmpty(), "Get autoupdate_version mock not empty")
@@ -72,6 +80,7 @@ func (m mockClient) checkIfEmpty(t *testing.T) {
     require.True(t, m.createAutoUpdateAgentRollout.isEmpty(), "Create autoupdate_agent_rollout mock not empty")
     require.True(t, m.updateAutoUpdateAgentRollout.isEmpty(), "Update autoupdate_agent_rollout mock not empty")
     require.True(t, m.deleteAutoUpdateAgentRollout.isEmpty(), "Delete autoupdate_agent_rollout mock not empty")
+    require.True(t, m.getClusterMaintenanceConfig.isEmpty(), "Get cluster_maintenance_config mock not empty")
 }
 
 func newMockClient(t *testing.T, stubs mockClientStubs) *mockClient {
@@ -86,6 +95,7 @@ func newMockClient(t *testing.T, stubs mockClientStubs) *mockClient {
         createAutoUpdateAgentRollout: &createUpdateHandler[*autoupdate.AutoUpdateAgentRollout]{t, stubs.createRolloutExpects, stubs.createRolloutAnswers},
         updateAutoUpdateAgentRollout: &createUpdateHandler[*autoupdate.AutoUpdateAgentRollout]{t, stubs.updateRolloutExpects, stubs.updateRolloutAnswers},
         deleteAutoUpdateAgentRollout: &deleteHandler{t, stubs.deleteRolloutAnswers},
+        getClusterMaintenanceConfig:  &legacyGetHandler[*types.ClusterMaintenanceConfigV1]{t, stubs.cmcAnswers},
     }
 }
 
@@ -98,6 +108,7 @@ type mockClientStubs struct {
     updateRolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout]
     updateRolloutExpects []require.ValueAssertionFunc
     deleteRolloutAnswers []error
+    cmcAnswers           []callAnswer[*types.ClusterMaintenanceConfigV1]
 }
 
 type callAnswer[T any] struct {
@@ -131,6 +142,35 @@ func (h *getHandler[T]) isEmpty() bool {
     return len(h.answers) == 0
 }
 
+// legacyGetHandler is a getHandler for legacy teleport types (gogo proto-based).
+// A first iteration tried to be smart and reuse the getHandler logic by
+// converting fixtures to protoadapt.MessageV2 up front and converting back to
+// protoadapt.MessageV1 before returning. The resulting code was hard to read,
+// so duplicating the logic seemed more maintainable.
+type legacyGetHandler[T protoadapt.MessageV1] struct {
+    t       *testing.T
+    answers []callAnswer[T]
+}
+
+func (h *legacyGetHandler[T]) handle(_ context.Context) (T, error) {
+    if len(h.answers) == 0 {
+        require.Fail(h.t, "no answers left")
+    }
+
+    entry := h.answers[0]
+    h.answers = h.answers[1:]
+
+    // We need to deep copy because the reconciler might do updates in place.
+    // We don't want the original resource to be edited as this would mess with other tests.
+    result := apiutils.CloneProtoMsg(entry.result)
+    return result, entry.err
+}
+
+// isEmpty returns true only if all stubs were consumed.
+func (h *legacyGetHandler[T]) isEmpty() bool {
+    return len(h.answers) == 0
+}
+
 // createUpdateHandler is used in a mock client to answer create or update resource requests during tests (any request whose arity is 2).
 // It first validates the input using the provided validation function, then it returns the predefined answer and error.
 // If there are no stubs left it fails the test.
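
Note: the new cmcAnswers stubs are consumed in FIFO order, one entry per GetClusterMaintenanceConfig call, and checkIfEmpty asserts that a test consumed them all. A hypothetical stand-alone use of the mock, with illustrative assertions:

    clt := newMockClient(t, mockClientStubs{
        cmcAnswers: []callAnswer[*types.ClusterMaintenanceConfigV1]{
            {result: nil, err: trace.NotFound("no cmc")},
        },
    })
    _, err := clt.GetClusterMaintenanceConfig(ctx)
    require.True(t, trace.IsNotFound(err))
    clt.checkIfEmpty(t) // the single stubbed answer was consumed
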
diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go
index 13bce5d88b6d4..5b83abab2b673 100644
--- a/lib/autoupdate/rollout/reconciler.go
+++ b/lib/autoupdate/rollout/reconciler.go
@@ -30,6 +30,7 @@ import (
     "google.golang.org/protobuf/types/known/timestamppb"
 
    "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
+    "github.com/gravitational/teleport/api/types"
     update "github.com/gravitational/teleport/api/types/autoupdate"
    "github.com/gravitational/teleport/api/utils"
 )
@@ -40,8 +41,9 @@ const (
     defaultStrategy  = update.AgentsStrategyHaltOnError
     maxConflictRetry = 3
 
-    defaultGroupName = "default"
-    defaultStartHour = 12
+    defaultGroupName    = "default"
+    defaultCMCGroupName = defaultGroupName + "-cmc"
+    defaultStartHour    = 12
 )
 
 var (
@@ -294,7 +296,7 @@ func (r *reconciler) computeStatus(
     groups := status.GetGroups()
     var err error
     if len(groups) == 0 {
-        groups, err = makeGroupsStatus(configSchedules, now)
+        groups, err = r.makeGroupsStatus(ctx, configSchedules, now)
         if err != nil {
             return nil, trace.Wrap(err, "creating groups status")
         }
@@ -302,7 +304,7 @@ func (r *reconciler) computeStatus(
 
     err = r.progressRollout(ctx, newSpec.GetStrategy(), groups)
     // Failing to progress the update is not a hard failure.
-    // We expected to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
+    // We want to update the status even if something went wrong, to surface the failed reconciliation and potential errors to the user.
     if err != nil {
         r.log.ErrorContext(ctx, "Errors encountered during rollout progress. Some groups might not get updated properly.", "error", err)
     }
@@ -328,10 +330,10 @@ func (r *reconciler) progressRollout(ctx context.Context, strategyName string, g
 
 // makeGroupsStatus creates the autoupdate_agent_rollout.status.groups based on the autoupdate_config.
 // This should be called if the status groups have not been initialized or must be reset.
-func makeGroupsStatus(schedules *autoupdate.AgentAutoUpdateSchedules, now time.Time) ([]*autoupdate.AutoUpdateAgentRolloutStatusGroup, error) {
+func (r *reconciler) makeGroupsStatus(ctx context.Context, schedules *autoupdate.AgentAutoUpdateSchedules, now time.Time) ([]*autoupdate.AutoUpdateAgentRolloutStatusGroup, error) {
     configGroups := schedules.GetRegular()
     if len(configGroups) == 0 {
-        defaultGroup, err := defaultConfigGroup()
+        defaultGroup, err := r.defaultConfigGroup(ctx)
         if err != nil {
             return nil, trace.Wrap(err, "retrieving default group")
         }
@@ -357,12 +359,45 @@ func makeGroupsStatus(schedules *autoupdate.AgentAutoUpdateSchedules, now time.T
 
 // defaultConfigGroup returns the default group in case of a missing autoupdate_config resource.
 // This is a function and not a variable because we will need to add more logic there in the future
 // to look up maintenance information from RFD 109's cluster_maintenance_config.
-func defaultConfigGroup() (*autoupdate.AgentAutoUpdateGroup, error) {
-    // TODO: get group from CMC if possible
+func (r *reconciler) defaultConfigGroup(ctx context.Context) (*autoupdate.AgentAutoUpdateGroup, error) {
+    cmc, err := r.clt.GetClusterMaintenanceConfig(ctx)
+    if err != nil {
+        if trace.IsNotFound(err) {
+            // There's no CMC, we return the default group.
+            return defaultGroup(), nil
+        }
+
+        // If we got any error other than NotFound, we stop.
+        return nil, trace.Wrap(err, "retrieving the cluster maintenance config")
+    }
+
+    // We got a CMC, we generate the default group from it.
+    upgradeWindow, ok := cmc.GetAgentUpgradeWindow()
+    if !ok {
+        // The CMC is here but does not contain an upgrade window.
+        return defaultGroup(), nil
+    }
+
+    weekdays := upgradeWindow.Weekdays
+    // A CMC upgrade window not specifying weekdays should update every day.
+    if len(weekdays) == 0 {
+        weekdays = []string{types.Wildcard}
+    }
+
+    return &autoupdate.AgentAutoUpdateGroup{
+        Name:      defaultCMCGroupName,
+        Days:      weekdays,
+        StartHour: int32(upgradeWindow.UTCStartHour),
+        WaitDays:  0,
+    }, nil
+}
+
+func defaultGroup() *autoupdate.AgentAutoUpdateGroup {
     return &autoupdate.AgentAutoUpdateGroup{
         Name:      defaultGroupName,
         Days:      defaultUpdateDays,
         StartHour: defaultStartHour,
         WaitDays:  0,
-    }, nil
+    }
 }
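
Note: to make the fallback behavior concrete, here is what defaultConfigGroup would derive from a hypothetical CMC whose window starts at 02:00 UTC on Mondays and Wednesdays. The values follow from the code above and are shown for illustration only:

    cmc := &types.ClusterMaintenanceConfigV1{
        Spec: types.ClusterMaintenanceConfigSpecV1{
            AgentUpgrades: &types.AgentUpgradeWindow{
                UTCStartHour: 2,
                Weekdays:     []string{"Mon", "Wed"},
            },
        },
    }
    // r.defaultConfigGroup(ctx) would return:
    // &autoupdate.AgentAutoUpdateGroup{
    //     Name:      "default-cmc", // defaultCMCGroupName
    //     Days:      []string{"Mon", "Wed"},
    //     StartHour: 2,
    //     WaitDays:  0,
    // }
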
diff --git a/lib/autoupdate/rollout/reconciler_test.go b/lib/autoupdate/rollout/reconciler_test.go
index 048a811b880b4..ac83d479d0e57 100644
--- a/lib/autoupdate/rollout/reconciler_test.go
+++ b/lib/autoupdate/rollout/reconciler_test.go
@@ -33,6 +33,7 @@ import (
     "google.golang.org/protobuf/types/known/timestamppb"
 
    "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
+    "github.com/gravitational/teleport/api/types"
     update "github.com/gravitational/teleport/api/types/autoupdate"
     apiutils "github.com/gravitational/teleport/api/utils"
    "github.com/gravitational/teleport/lib/backend"
@@ -576,6 +577,7 @@ func TestReconciler_Reconcile(t *testing.T) {
 
 func Test_makeGroupsStatus(t *testing.T) {
     now := time.Now()
+    ctx := context.Background()
 
     tests := []struct {
         name      string
@@ -680,7 +682,15 @@ func Test_makeGroupsStatus(t *testing.T) {
     }
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
-            result, err := makeGroupsStatus(tt.schedules, now)
+            // We craft a mock client that always answers there's no CMC.
+            // Checking the CMC client usage is not the point of this test, so we don't count the number of calls here.
+            // CMC-specific tests happen in TestDefaultConfigGroup().
+            clt := newMockClient(t, mockClientStubs{cmcAnswers: []callAnswer[*types.ClusterMaintenanceConfigV1]{{
+                result: nil,
+                err:    trace.NotFound("no cmc"),
+            }}})
+            r := reconciler{clt: clt}
+            result, err := r.makeGroupsStatus(ctx, tt.schedules, now)
             require.NoError(t, err)
             require.Equal(t, tt.expected, result)
         })
@@ -734,7 +744,8 @@ func Test_reconciler_computeStatus(t *testing.T) {
             },
         },
     }
-    newGroups, err := makeGroupsStatus(schedules, clock.Now())
+    r := reconciler{}
+    newGroups, err := r.makeGroupsStatus(ctx, schedules, clock.Now())
     require.NoError(t, err)
     newStatus := &autoupdate.AutoUpdateAgentRolloutStatus{
         Groups: newGroups,
     }
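
Note: r := reconciler{} above carries a nil client. This presumably stays safe because the schedules fixture in Test_reconciler_computeStatus defines regular groups, so makeGroupsStatus never falls through to defaultConfigGroup, which would dereference r.clt. A case with empty schedules would need a stubbed client instead, sketched here for illustration:

    r := reconciler{clt: newMockClient(t, mockClientStubs{
        cmcAnswers: []callAnswer[*types.ClusterMaintenanceConfigV1]{
            {result: nil, err: trace.NotFound("no cmc")},
        },
    })}
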
[]string{"*"}, + StartHour: int32(testStartHour), + WaitDays: 0, + }, + expectError: require.NoError, + }, + { + name: "CMC with weekdays", + cmcAnswer: callAnswer[*types.ClusterMaintenanceConfigV1]{ + &types.ClusterMaintenanceConfigV1{ + Spec: types.ClusterMaintenanceConfigSpecV1{ + AgentUpgrades: &types.AgentUpgradeWindow{ + UTCStartHour: uint32(testStartHour), + Weekdays: everyWeekdayButSunday, + }, + }, + }, nil, + }, + expectedResult: &autoupdate.AgentAutoUpdateGroup{ + Name: defaultCMCGroupName, + Days: everyWeekdayButSunday, + StartHour: int32(testStartHour), + WaitDays: 0, + }, + expectError: require.NoError, + }, + { + name: "unexpected error getting CMC", + cmcAnswer: callAnswer[*types.ClusterMaintenanceConfigV1]{ + nil, trace.ConnectionProblem(trace.Errorf("oh no"), "connection failed"), + }, + expectedResult: nil, + expectError: require.Error, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test setup: loading fixtures. + clt := newMockClient(t, mockClientStubs{cmcAnswers: []callAnswer[*types.ClusterMaintenanceConfigV1]{tt.cmcAnswer}}) + r := &reconciler{clt: clt} + // Test execution. + result, err := r.defaultConfigGroup(ctx) + tt.expectError(t, err) + require.Equal(t, tt.expectedResult, result) + // Test validation: the mock client should be empty. + clt.checkIfEmpty(t) + }) + } +}