Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Container Restart Policy feature #4264

Merged
merged 12 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions agent/api/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ import (

resourcestatus "github.com/aws/amazon-ecs-agent/agent/taskresource/status"
referenceutil "github.com/aws/amazon-ecs-agent/agent/utils/reference"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/container/restart"
apicontainerstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/container/status"
apierrors "github.com/aws/amazon-ecs-agent/ecs-agent/api/errors"
"github.com/aws/amazon-ecs-agent/ecs-agent/credentials"
"github.com/aws/amazon-ecs-agent/ecs-agent/logger"
"github.com/aws/amazon-ecs-agent/ecs-agent/logger/field"

"github.com/aws/aws-sdk-go/aws"
"github.com/cihub/seelog"
"github.com/docker/docker/api/types"
Expand Down Expand Up @@ -323,9 +323,16 @@ type Container struct {
// pause container
ContainerTornDownUnsafe bool `json:"containerTornDown"`

createdAt time.Time
startedAt time.Time
finishedAt time.Time
createdAt time.Time
// StartedAtUnsafe specifies the started at time of the container.
// It is exposed outside this container package so that it is marshalled/unmarshalled in JSON body while
// saving state.
// NOTE: Do not access StartedAtUnsafe directly. Instead, use `GetStartedAt` and `SetStartedAt`.
StartedAtUnsafe time.Time `json:"startedAt,omitempty"`
// setStartedAtOnce is used to set the value of the container's started at time only the first time SetStartedAt is
// invoked.
setStartedAtOnce sync.Once
finishedAt time.Time

labels map[string]string

Expand All @@ -335,13 +342,31 @@ type Container struct {
ContainerPortSet map[int]struct{}
// ContainerPortRangeMap is a map of containerPortRange to its associated hostPortRange
ContainerPortRangeMap map[string]string

// RestartPolicy is an object representing the restart policy of the container
RestartPolicy *restart.RestartPolicy `json:"restartPolicy,omitempty"`
// RestartTracker tracks this container's restart policy metadata, such
// as restart count and last restart time. This is only initialized if the container
// has a restart policy defined and enabled.
RestartTracker *restart.RestartTracker `json:"restartTracker,omitempty"`
// RestartAggregationDataForStatsUnsafe specifies the restart aggregation data used for stats of the container.
// It is exposed outside this container package so that it is marshalled/unmarshalled in JSON body while
// saving state.
// NOTE: Do not access RestartAggregationDataForStatsUnsafe directly. Instead, use
// `GetRestartAggregationDataForStats` and `SetRestartAggregationDataForStats`.
RestartAggregationDataForStatsUnsafe ContainerRestartAggregationDataForStats `json:"RestartAggregationDataForStats,omitempty"`
}

type DependsOn struct {
ContainerName string `json:"containerName"`
Condition string `json:"condition"`
}

type ContainerRestartAggregationDataForStats struct {
LastRestartDetectedAt time.Time `json:"LastRestartDetectedAt,omitempty"`
LastStatBeforeLastRestart types.StatsJSON `json:"LastStatBeforeLastRestart,omitempty"`
}

// DockerContainer is a mapping between containers-as-docker-knows-them and
// containers-as-we-know-them.
// This is primarily used in DockerState, but lives here such that tasks and
Expand Down Expand Up @@ -628,6 +653,16 @@ func (c *Container) IsEssential() bool {
return c.Essential
}

// RestartPolicyEnabled returns whether the restart policy is defined and enabled
func (c *Container) RestartPolicyEnabled() bool {
c.lock.RLock()
defer c.lock.RUnlock()
if c.RestartPolicy == nil {
return false
}
return c.RestartPolicy.Enabled
}

// AWSLogAuthExecutionRole returns true if the auth is by execution role
func (c *Container) AWSLogAuthExecutionRole() bool {
return c.LogsAuthStrategy == awslogsAuthExecutionRole
Expand All @@ -653,7 +688,11 @@ func (c *Container) SetStartedAt(startedAt time.Time) {
c.lock.Lock()
defer c.lock.Unlock()

c.startedAt = startedAt
c.setStartedAtOnce.Do(func() {
if c.StartedAtUnsafe.IsZero() {
c.StartedAtUnsafe = startedAt
}
})
}

// SetFinishedAt sets the timestamp for container's stopped time
Expand Down Expand Up @@ -681,7 +720,7 @@ func (c *Container) GetStartedAt() time.Time {
c.lock.RLock()
defer c.lock.RUnlock()

return c.startedAt
return c.StartedAtUnsafe
}

// GetFinishedAt sets the timestamp for container's stopped time
Expand Down Expand Up @@ -1536,3 +1575,20 @@ func (c *Container) DigestResolved() bool {
func (c *Container) DigestResolutionRequired() bool {
return !c.IsInternal() && referenceutil.GetDigestFromImageRef(c.Image) == ""
}

// GetRestartAggregationDataForStats gets the restart aggregation data for stats of a container.
func (c *Container) GetRestartAggregationDataForStats() ContainerRestartAggregationDataForStats {
c.lock.RLock()
defer c.lock.RUnlock()

return c.RestartAggregationDataForStatsUnsafe
}

// SetRestartAggregationDataForStats sets the restart aggregation data for stats of a container.
func (c *Container) SetRestartAggregationDataForStats(
restartAggregationDataForStats ContainerRestartAggregationDataForStats) {
c.lock.Lock()
defer c.lock.Unlock()

c.RestartAggregationDataForStatsUnsafe = restartAggregationDataForStats
}
115 changes: 115 additions & 0 deletions agent/api/container/container_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ import (
"time"

resourcestatus "github.com/aws/amazon-ecs-agent/agent/taskresource/status"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/container/restart"
apicontainerstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/container/status"
"github.com/docker/docker/api/types"

"github.com/aws/amazon-ecs-agent/agent/utils"
dockercontainer "github.com/docker/docker/api/types/container"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

type configPair struct {
Expand Down Expand Up @@ -1334,6 +1337,118 @@ func TestGetCredentialSpec(t *testing.T) {
}
}

func TestRestartPolicy(t *testing.T) {
testCases := []struct {
name string
container *Container
restartCount int
expectedEnabled bool
}{
{
name: "nil restart policy",
container: &Container{
RestartPolicy: nil,
},
restartCount: 0,
expectedEnabled: false,
},
{
name: "not enabled restart policy",
container: &Container{
RestartPolicy: &restart.RestartPolicy{},
},
restartCount: 0,
expectedEnabled: false,
},
{
name: "explicitly not enabled restart policy",
container: &Container{
RestartPolicy: &restart.RestartPolicy{
Enabled: false,
},
},
restartCount: 0,
expectedEnabled: false,
},
{
name: "enabled restart policy",
container: &Container{
RestartPolicy: &restart.RestartPolicy{
Enabled: true,
},
},
restartCount: 0,
expectedEnabled: true,
},
{
name: "enabled restart policy, record 5 restarts",
container: &Container{
RestartPolicy: &restart.RestartPolicy{
Enabled: true,
},
},
restartCount: 5,
expectedEnabled: true,
},
}

for _, tc := range testCases {
require.Equal(t, tc.expectedEnabled, tc.container.RestartPolicyEnabled())
if tc.container.RestartPolicyEnabled() {
tc.container.RestartTracker = restart.NewRestartTracker(*tc.container.RestartPolicy)
for i := 0; i < tc.restartCount; i++ {
tc.container.RestartTracker.RecordRestart()
}
require.Equal(t, tc.restartCount, tc.container.RestartTracker.GetRestartCount())
}
}
}

func TestGetAndSetStartedAt(t *testing.T) {
testTime := time.Date(1969, 12, 31, 23, 59, 59, 0, time.UTC)
c := &Container{}

// Test getting started at time when it has never been set is the zero value of time.
require.True(t, c.GetStartedAt().IsZero())

// Test setting started at time sets the started at time.
c.SetStartedAt(testTime)
require.Equal(t, testTime, c.GetStartedAt())

// Test setting started at time after it has already been set does not change the originally set started at time.
c.SetStartedAt(time.Now())
require.Equal(t, testTime, c.GetStartedAt())
}

func TestGetAndSetRestartAggregationDataForStats(t *testing.T) {
testTime := time.Date(1969, 12, 31, 23, 59, 59, 0, time.UTC)
testStatsJSON := types.StatsJSON{
Stats: types.Stats{
CPUStats: types.CPUStats{
CPUUsage: types.CPUUsage{
TotalUsage: 100,
},
},
MemoryStats: types.MemoryStats{
MaxUsage: 200,
},
},
}
testRestartAggregationDataForStats := ContainerRestartAggregationDataForStats{
LastRestartDetectedAt: testTime,
LastStatBeforeLastRestart: testStatsJSON,
}
c := &Container{}

// Test getting restart aggregation data for stats when it has never been set is the zero value of the
// ContainerRestartAggregationDataForStats struct.
require.Equal(t, ContainerRestartAggregationDataForStats{}, c.GetRestartAggregationDataForStats())

// Test setting restart aggregation data for stats sets the restart aggregation data for stats.
c.SetRestartAggregationDataForStats(testRestartAggregationDataForStats)
require.Equal(t, testRestartAggregationDataForStats, c.GetRestartAggregationDataForStats())
}

func getContainer(hostConfig string, credentialSpecs []string) *Container {
c := &Container{
Name: "c",
Expand Down
13 changes: 13 additions & 0 deletions agent/api/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
taskresourcevolume "github.com/aws/amazon-ecs-agent/agent/taskresource/volume"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/ecs-agent/acs/model/ecsacs"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/container/restart"
apicontainerstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/container/status"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/model/ecs"
apierrors "github.com/aws/amazon-ecs-agent/ecs-agent/api/errors"
Expand Down Expand Up @@ -466,6 +467,8 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config,
}
}

task.initRestartTrackers()

for _, opt := range options {
if err := opt(task); err != nil {
logger.Error("Could not apply task option", logger.Fields{
Expand Down Expand Up @@ -523,6 +526,16 @@ func (task *Task) initNetworkMode(acsTaskNetworkMode *string) {
}
}

// initRestartTrackers initializes the restart policy tracker for each container
// that has a restart policy configured and enabled.
func (task *Task) initRestartTrackers() {
for _, c := range task.Containers {
if c.RestartPolicyEnabled() {
c.RestartTracker = restart.NewRestartTracker(*c.RestartPolicy)
}
}
}

func (task *Task) initServiceConnectResources() error {
// TODO [SC]: ServiceConnectConfig will come from ACS. Adding this here for dev/testing purposes only Remove when
// ACS model is integrated
Expand Down
38 changes: 38 additions & 0 deletions agent/api/task/task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/ecs-agent/acs/model/ecsacs"
apiresource "github.com/aws/amazon-ecs-agent/ecs-agent/api/attachment/resource"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/container/restart"
apicontainerstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/container/status"
"github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/model/ecs"
apitaskstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/task/status"
Expand Down Expand Up @@ -3908,6 +3909,43 @@ func TestPostUnmarshalTaskEnvfiles(t *testing.T) {
task.Containers[0].TransitionDependenciesMap[apicontainerstatus.ContainerCreated].ResourceDependencies[0])
}

func TestPostUnmarshalTaskContainerRestartPolicy(t *testing.T) {
container := &apicontainer.Container{
Name: "containerName",
Image: "image:tag",
RestartPolicy: &restart.RestartPolicy{
Enabled: true,
},
TransitionDependenciesMap: make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet),
}

task := &Task{
Arn: testTaskARN,
ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource),
Containers: []*apicontainer.Container{container},
}

ctrl := gomock.NewController(t)
defer ctrl.Finish()

cfg := &config.Config{}
credentialsManager := mock_credentials.NewMockManager(ctrl)
resFields := &taskresource.ResourceFields{
ResourceFieldsCommon: &taskresource.ResourceFieldsCommon{
CredentialsManager: credentialsManager,
},
}

err := task.PostUnmarshalTask(cfg, credentialsManager, resFields, nil, nil)
assert.NoError(t, err)

for _, c := range task.Containers {
assert.True(t, c.RestartPolicyEnabled())
assert.NotNil(t, c.RestartTracker)
assert.Equal(t, 0, c.RestartTracker.GetRestartCount())
}
}

func TestInitializeAndGetEnvfilesResource(t *testing.T) {
envfile1 := apicontainer.EnvironmentFile{
Value: "s3://bucket/envfile1",
Expand Down
2 changes: 1 addition & 1 deletion agent/app/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -918,7 +918,7 @@ func (agent *ecsAgent) startAsyncRoutines(
telemetryMessages := make(chan ecstcs.TelemetryMessage, telemetryChannelDefaultBufferSize)
healthMessages := make(chan ecstcs.HealthMessage, telemetryChannelDefaultBufferSize)

statsEngine := stats.NewDockerStatsEngine(agent.cfg, agent.dockerClient, containerChangeEventStream, telemetryMessages, healthMessages)
statsEngine := stats.NewDockerStatsEngine(agent.cfg, agent.dockerClient, containerChangeEventStream, telemetryMessages, healthMessages, agent.dataClient)

// Start serving the endpoint to fetch IAM Role credentials and other task metadata
if agent.cfg.TaskMetadataAZDisabled {
Expand Down
4 changes: 4 additions & 0 deletions agent/app/agent_capability.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ const (
capabilityServiceConnect = "service-connect-v1"
capabilityGpuDriverVersion = "gpu-driver-version"
capabilityEBSTaskAttach = "storage.ebs-task-volume-attach"
capabilityContainerRestartPolicy = "container-restart-policy"

// network capabilities, going forward, please append "network." prefix to any new networking capability we introduce
networkCapabilityPrefix = "network."
Expand Down Expand Up @@ -110,6 +111,8 @@ var (
capabilityEnvFilesS3,
// support container port range in container definition - port mapping field
capabilityContainerPortRange,
// support container restart policy
capabilityContainerRestartPolicy,
}
// use empty struct as value type to simulate set
capabilityExecInvalidSsmVersions = map[string]struct{}{}
Expand Down Expand Up @@ -194,6 +197,7 @@ var (
// ecs.capability.external
// ecs.capability.service-connect-v1
// ecs.capability.network.container-port-range
// ecs.capability.container-restart-policy
func (agent *ecsAgent) capabilities() ([]*ecs.Attribute, error) {
var capabilities []*ecs.Attribute

Expand Down
2 changes: 2 additions & 0 deletions agent/app/agent_capability_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ func TestCapabilities(t *testing.T) {
attributePrefix + capabilityExec,
attributePrefix + capabilityServiceConnect,
attributePrefix + capabilityContainerPortRange,
attributePrefix + capabilityContainerRestartPolicy,
}

var expectedCapabilities []*ecs.Attribute
Expand Down Expand Up @@ -1314,6 +1315,7 @@ func TestCapabilitiesNoServiceConnect(t *testing.T) {
attributePrefix + taskENIBlockInstanceMetadataAttributeSuffix,
attributePrefix + capabilityExec,
attributePrefix + capabilityContainerPortRange,
attributePrefix + capabilityContainerRestartPolicy,
}

var expectedCapabilities []*ecs.Attribute
Expand Down
Loading
Loading