From f8bb8296bd906d52fb78f5065fd19f55e5c610c6 Mon Sep 17 00:00:00 2001 From: Angel Velazquez Date: Mon, 21 Feb 2022 18:58:54 -0800 Subject: [PATCH] Improve log readability --- .../acs/handler/attach_eni_handler_common.go | 21 +- agent/acs/handler/payload_handler.go | 14 +- agent/api/ecsclient/client.go | 16 +- agent/api/eni/eniattachment.go | 28 +- agent/api/task/task.go | 261 +++++--- agent/api/task/task_linux.go | 7 +- agent/api/task/task_test.go | 31 +- agent/app/agent.go | 68 +- agent/dockerclient/dockerapi/docker_client.go | 7 +- agent/engine/common_test.go | 1 - agent/engine/docker_image_manager.go | 7 +- agent/engine/docker_task_engine.go | 598 +++++++++++++----- agent/engine/docker_task_engine_linux_test.go | 13 +- agent/engine/docker_task_engine_windows.go | 10 +- agent/engine/engine_sudo_linux_integ_test.go | 2 +- agent/engine/execcmd/manager_start.go | 69 +- agent/engine/image/types.go | 21 +- agent/engine/task_manager.go | 230 +++---- agent/engine/task_manager_unix_test.go | 2 +- agent/eni/watcher/watcher_linux.go | 15 +- agent/eventhandler/task_handler_types.go | 20 +- agent/logger/field/constants.go | 8 +- agent/stats/engine.go | 73 ++- agent/stats/task.go | 42 +- agent/stats/task_linux.go | 3 +- agent/stats/task_unspecified.go | 2 +- agent/stats/task_windows.go | 2 +- agent/stats/types.go | 1 + agent/taskresource/asmsecret/asmsecret.go | 4 +- .../taskresource/cgroup/control/init_linux.go | 6 +- agent/taskresource/ssmsecret/ssmsecret.go | 2 +- agent/utils/arn.go | 33 + agent/wsclient/client.go | 6 +- 33 files changed, 1127 insertions(+), 496 deletions(-) create mode 100644 agent/utils/arn.go diff --git a/agent/acs/handler/attach_eni_handler_common.go b/agent/acs/handler/attach_eni_handler_common.go index c41e05081fd..210d80f2e64 100644 --- a/agent/acs/handler/attach_eni_handler_common.go +++ b/agent/acs/handler/attach_eni_handler_common.go @@ -17,6 +17,9 @@ import ( "fmt" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/acs/model/ecsacs" apieni "github.com/aws/amazon-ecs-agent/agent/api/eni" "github.com/aws/amazon-ecs-agent/agent/data" @@ -118,16 +121,26 @@ func addENIAttachmentToState(attachmentType, attachmentARN, taskARN, mac string, switch attachmentType { case apieni.ENIAttachmentTypeTaskENI: - seelog.Infof("Adding task eni attachment info for task '%s' to state, attachment=%s mac=%s", - taskARN, attachmentARN, mac) + taskId, _ := utils.TaskIdFromArn(taskARN) + logger.Info("Adding eni attachment info to state for task", logger.Fields{ + field.TaskID: taskId, + "attachmentARN": attachmentARN, + "mac": mac, + }) case apieni.ENIAttachmentTypeInstanceENI: - seelog.Infof("Adding instance eni attachment info to state, attachment=%s mac=%s", attachmentARN, mac) + logger.Info("Adding instance eni attachment info to state", logger.Fields{ + "attachmentARN": attachmentARN, + "mac": mac, + }) default: return fmt.Errorf("unrecognized eni attachment type: %s", attachmentType) } state.AddENIAttachment(eniAttachment) if err := dataClient.SaveENIAttachment(eniAttachment); err != nil { - seelog.Errorf("Failed to save data for eni attachment %s: %v", eniAttachment.AttachmentARN, err) + logger.Error("Failed to save data for eni attachment", logger.Fields{ + "attachmentARN": attachmentARN, + field.Error: err, + }) } return nil } diff --git a/agent/acs/handler/payload_handler.go b/agent/acs/handler/payload_handler.go index 8d82bdabcf4..0df1b810d6d 100644 --- 
a/agent/acs/handler/payload_handler.go +++ b/agent/acs/handler/payload_handler.go @@ -16,6 +16,9 @@ import ( "context" "fmt" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/acs/model/ecsacs" "github.com/aws/amazon-ecs-agent/agent/api" apiappmesh "github.com/aws/amazon-ecs-agent/agent/api/appmesh" @@ -128,7 +131,10 @@ func (payloadHandler *payloadRequestHandler) ackMessageId(messageID string) { MessageId: aws.String(messageID), }) if err != nil { - seelog.Warnf("Error 'ack'ing request with messageID: %s, error: %v", messageID, err) + logger.Warn("Error ack'ing request", logger.Fields{ + "messageID": messageID, + field.Error: err, + }) } } @@ -198,6 +204,12 @@ func (payloadHandler *payloadRequestHandler) addPayloadTasks(payload *ecsacs.Pay continue } + logger.Info("Received task payload from ACS", logger.Fields{ + field.TaskARN: apiTask.Arn, + "version": apiTask.Version, + field.DesiredStatus: apiTask.GetDesiredStatus(), + }) + if task.RoleCredentials != nil { // The payload from ACS for the task has credentials for the // task. Add those to the credentials manager and set the diff --git a/agent/api/ecsclient/client.go b/agent/api/ecsclient/client.go index cb5c357d5a6..2ff65d9fe12 100644 --- a/agent/api/ecsclient/client.go +++ b/agent/api/ecsclient/client.go @@ -20,6 +20,8 @@ import ( "strings" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/api" apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" apierrors "github.com/aws/amazon-ecs-agent/agent/api/errors" @@ -587,8 +589,11 @@ func (client *APIECSClient) discoverPollEndpoint(containerInstanceArn string) (* if !expired && found { // Cache hit and not expired. Return the output. if output, ok := cachedEndpoint.(*ecs.DiscoverPollEndpointOutput); ok { - seelog.Infof("Using cached DiscoverPollEndpoint. endpoint=%s telemetryEndpoint=%s containerInstanceARN=%s", - aws.StringValue(output.Endpoint), aws.StringValue(output.TelemetryEndpoint), containerInstanceArn) + logger.Info("Using cached DiscoverPollEndpoint", logger.Fields{ + "endpoint": aws.StringValue(output.Endpoint), + "telemetryEndpoint": aws.StringValue(output.TelemetryEndpoint), + "containerInstanceARN": containerInstanceArn, + }) return output, nil } } @@ -604,8 +609,11 @@ func (client *APIECSClient) discoverPollEndpoint(containerInstanceArn string) (* // we have it. if expired { if output, ok := cachedEndpoint.(*ecs.DiscoverPollEndpointOutput); ok { - seelog.Infof("Error calling DiscoverPollEndpoint. Using cached but expired endpoint as a fallback. error=%s endpoint=%s telemetryEndpoint=%s containerInstanceARN=%s", - err, aws.StringValue(output.Endpoint), aws.StringValue(output.TelemetryEndpoint), containerInstanceArn) + logger.Info("Error calling DiscoverPollEndpoint. 
Using cached-but-expired endpoint as a fallback.", logger.Fields{ + "endpoint": aws.StringValue(output.Endpoint), + "telemetryEndpoint": aws.StringValue(output.TelemetryEndpoint), + "containerInstanceARN": containerInstanceArn, + }) return output, nil } } diff --git a/agent/api/eni/eniattachment.go b/agent/api/eni/eniattachment.go index 82c0c38c9ac..dccf34201b2 100644 --- a/agent/api/eni/eniattachment.go +++ b/agent/api/eni/eniattachment.go @@ -18,8 +18,11 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/utils" + "github.com/aws/amazon-ecs-agent/agent/utils/ttime" - "github.com/cihub/seelog" "github.com/pkg/errors" ) @@ -55,6 +58,25 @@ type ENIAttachment struct { guard sync.RWMutex } +func getEniAttachmentLogFields(eni *ENIAttachment, duration time.Duration) logger.Fields { + fields := logger.Fields{ + "duration": duration.String(), + "attachmentARN": eni.AttachmentARN, + "attachmentType": eni.AttachmentType, + "attachmentSent": eni.AttachStatusSent, + "mac": eni.MACAddress, + "status": eni.Status.String(), + "expiresAt": eni.ExpiresAt.Format(time.RFC3339), + } + + if eni.AttachmentType != ENIAttachmentTypeInstanceENI { + taskId, _ := utils.TaskIdFromArn(eni.TaskARN) + fields[field.TaskID] = taskId + } + + return fields +} + // StartTimer starts the ack timer to record the expiration of ENI attachment func (eni *ENIAttachment) StartTimer(timeoutFunc func()) error { eni.guard.Lock() @@ -70,7 +92,7 @@ func (eni *ENIAttachment) StartTimer(timeoutFunc func()) error { return errors.Errorf("eni attachment: timer expiration is in the past; expiration [%s] < now [%s]", eni.ExpiresAt.String(), now.String()) } - seelog.Infof("Starting ENI ack timer with duration=%s, %s", duration.String(), eni.stringUnsafe()) + logger.Info("Starting ENI ack timer", getEniAttachmentLogFields(eni, duration)) eni.ackTimer = time.AfterFunc(duration, timeoutFunc) return nil } @@ -92,7 +114,7 @@ func (eni *ENIAttachment) Initialize(timeoutFunc func()) error { return errors.New("ENI attachment has already expired") } - seelog.Infof("Starting ENI ack timer with duration=%s, %s", duration.String(), eni.stringUnsafe()) + logger.Info("Starting ENI ack timer", getEniAttachmentLogFields(eni, duration)) eni.ackTimer = time.AfterFunc(duration, timeoutFunc) return nil } diff --git a/agent/api/task/task.go b/agent/api/task/task.go index 96c02bdd2be..93958671e53 100644 --- a/agent/api/task/task.go +++ b/agent/api/task/task.go @@ -28,7 +28,6 @@ import ( "github.com/aws/amazon-ecs-agent/agent/logger/field" "github.com/aws/amazon-ecs-agent/agent/utils/ttime" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/arn" "github.com/docker/docker/api/types" "github.com/docker/go-connections/nat" @@ -54,7 +53,6 @@ import ( taskresourcevolume "github.com/aws/amazon-ecs-agent/agent/taskresource/volume" "github.com/aws/amazon-ecs-agent/agent/utils" "github.com/aws/aws-sdk-go/private/protocol/json/jsonutil" - "github.com/cihub/seelog" dockercontainer "github.com/docker/docker/api/types/container" "github.com/pkg/errors" ) @@ -83,7 +81,6 @@ const ( ContainerOrderingCreateCondition = "CREATE" ContainerOrderingStartCondition = "START" - arnResourceDelimiter = "/" // networkModeNone specifies the string used to define the `none` docker networking mode networkModeNone = "none" // dockerMappingContainerPrefix specifies the prefix string used for setting the @@ -156,6 +153,8 @@ type TaskOverrides struct{} type Task struct 
{ // Arn is the unique identifier for the task Arn string + // id is the id section of the task ARN + id string // Overrides are the overrides applied to a task Overrides TaskOverrides `json:"-"` // Family is the name of the task definition family @@ -271,6 +270,9 @@ type Task struct { // lock is for protecting all fields in the task struct lock sync.RWMutex + + // setIdOnce is used to set the value of this task's id only the first time GetID is invoked + setIdOnce sync.Once } // TaskFromACS translates ecsacs.Task to apitask.Task by first marshaling the received @@ -330,18 +332,27 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config, task.adjustForPlatform(cfg) if task.MemoryCPULimitsEnabled { if err := task.initializeCgroupResourceSpec(cfg.CgroupPath, cfg.CgroupCPUPeriod, resourceFields); err != nil { - seelog.Errorf("Task [%s]: could not intialize resource: %v", task.Arn, err) + logger.Error("Could not initialize resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } } if err := task.initializeContainerOrderingForVolumes(); err != nil { - seelog.Errorf("Task [%s]: could not initialize volumes dependency for container: %v", task.Arn, err) + logger.Error("Could not initialize volumes dependency for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } if err := task.initializeContainerOrderingForLinks(); err != nil { - seelog.Errorf("Task [%s]: could not initialize links dependency for container: %v", task.Arn, err) + logger.Error("Could not initialize links dependency for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } @@ -356,14 +367,20 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config, } if err := task.addGPUResource(cfg); err != nil { - seelog.Errorf("Task [%s]: could not initialize GPU associations: %v", task.Arn, err) + logger.Error("Could not initialize GPU associations", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } task.initializeContainersV3MetadataEndpoint(utils.NewDynamicUUIDProvider()) task.initializeContainersV4MetadataEndpoint(utils.NewDynamicUUIDProvider()) if err := task.addNetworkResourceProvisioningDependency(cfg); err != nil { - seelog.Errorf("Task [%s]: could not provision network resource: %v", task.Arn, err) + logger.Error("Could not provision network resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } // Adds necessary Pause containers for sharing PID or IPC namespaces @@ -375,13 +392,19 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config, if task.requiresCredentialSpecResource() { if err := task.initializeCredentialSpecResource(cfg, credentialsManager, resourceFields); err != nil { - seelog.Errorf("Task [%s]: could not initialize credentialspec resource: %v", task.Arn, err) + logger.Error("Could not initialize credentialspec resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } } if err := task.initializeEnvfilesResource(cfg, credentialsManager); err != nil { - seelog.Errorf("Task [%s]: could not initialize environment files resource: %v", task.Arn, err) + logger.Error("Could not initialize environment files resource", logger.Fields{ + field.TaskID: 
task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } task.populateTaskARN() @@ -389,14 +412,20 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config, // fsxWindowsFileserver is the product type -- it is technically "agnostic" ie it should apply to both Windows and Linux tasks if task.requiresFSxWindowsFileServerResource() { if err := task.initializeFSxWindowsFileServerResource(cfg, credentialsManager, resourceFields); err != nil { - seelog.Errorf("Task [%s]: could not initialize FSx for Windows File Server resource: %v", task.Arn, err) + logger.Error("Could not initialize FSx for Windows File Server resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return apierrors.NewResourceInitError(task.Arn, err) } } for _, opt := range options { if err := opt(task); err != nil { - seelog.Errorf("Task [%s]: could not apply task option: %v", task.Arn, err) + logger.Error("Could not apply task option", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return err } } @@ -670,8 +699,11 @@ func (task *Task) addSharedVolumes(SharedVolumeMatchFullConfig bool, ctx context if _, ok := volumeMetadata.Error.(*dockerapi.DockerTimeoutError); ok { return volumeMetadata.Error } - - seelog.Infof("initialize volume: Task [%s]: non-autoprovisioned volume not found, adding to task resource %q", task.Arn, vol.Name) + logger.Error("Failed to initialize non-autoprovisioned volume", logger.Fields{ + field.TaskID: task.GetID(), + field.Volume: vol.Name, + field.Error: volumeMetadata.Error, + }) // this resource should be created by agent volumeResource, err := taskresourcevolume.NewVolumeResource( ctx, @@ -690,22 +722,34 @@ func (task *Task) addSharedVolumes(SharedVolumeMatchFullConfig bool, ctx context return nil } - seelog.Infof("initialize volume: Task [%s]: volume [%s] already exists", task.Arn, volumeConfig.DockerVolumeName) + logger.Debug("Volume already exists", logger.Fields{ + field.TaskID: task.GetID(), + field.Volume: volumeConfig.DockerVolumeName, + }) if !SharedVolumeMatchFullConfig { - seelog.Infof("initialize volume: Task [%s]: ECS_SHARED_VOLUME_MATCH_FULL_CONFIG is set to false and volume with name [%s] is found", task.Arn, volumeConfig.DockerVolumeName) + logger.Info("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG is set to false and volume was found", logger.Fields{ + field.TaskID: task.GetID(), + field.Volume: volumeConfig.DockerVolumeName, + }) return nil } // validate all the volume metadata fields match to the configuration if len(volumeMetadata.DockerVolume.Labels) == 0 && len(volumeMetadata.DockerVolume.Labels) == len(volumeConfig.Labels) { - seelog.Infof("labels are both empty or null: Task [%s]: volume [%s]", task.Arn, volumeConfig.DockerVolumeName) + logger.Info("Volume labels are both empty or null", logger.Fields{ + field.TaskID: task.GetID(), + field.Volume: volumeConfig.DockerVolumeName, + }) } else if !reflect.DeepEqual(volumeMetadata.DockerVolume.Labels, volumeConfig.Labels) { return errors.Errorf("intialize volume: non-autoprovisioned volume does not match existing volume labels: existing: %v, expected: %v", volumeMetadata.DockerVolume.Labels, volumeConfig.Labels) } if len(volumeMetadata.DockerVolume.Options) == 0 && len(volumeMetadata.DockerVolume.Options) == len(volumeConfig.DriverOpts) { - seelog.Infof("driver options are both empty or null: Task [%s]: volume [%s]", task.Arn, volumeConfig.DockerVolumeName) + logger.Info("Volume driver options are both empty or null", logger.Fields{ + field.TaskID: 
task.GetID(), + field.Volume: volumeConfig.DockerVolumeName, + }) } else if !reflect.DeepEqual(volumeMetadata.DockerVolume.Options, volumeConfig.DriverOpts) { return errors.Errorf("initialize volume: non-autoprovisioned volume does not match existing volume options: existing: %v, expected: %v", volumeMetadata.DockerVolume.Options, volumeConfig.DriverOpts) @@ -742,7 +786,9 @@ func (task *Task) initializeCredentialsEndpoint(credentialsManager credentials.M // the id. This should never happen as the payload handler sets // credentialsId for the task after adding credentials to the // credentials manager - seelog.Errorf("Unable to get credentials for task: %s", task.Arn) + logger.Error("Unable to get credentials for task", logger.Fields{ + field.TaskID: task.GetID(), + }) return } @@ -1021,7 +1067,10 @@ func (task *Task) addFirelensContainerDependency() error { if firelensContainer.HasContainerDependencies() { // If firelens container has any container dependency, we don't add internal container dependency that depends // on it in order to be safe (otherwise we need to deal with circular dependency). - seelog.Warnf("Not adding container dependency to let firelens container %s start first, because it has dependency on other containers.", firelensContainer.Name) + logger.Warn("Not adding container dependency to let firelens container start first since it has dependency on other containers.", logger.Fields{ + field.TaskID: task.GetID(), + "firelensContainer": firelensContainer.Name, + }) return nil } @@ -1045,8 +1094,11 @@ func (task *Task) addFirelensContainerDependency() error { // If there's no dependency between the app container and the firelens container, make firelens container // start first to be the default behavior by adding a START container depdendency. if !container.DependsOnContainer(firelensContainer.Name) { - seelog.Infof("Adding a START container dependency on firelens container %s for container %s", - firelensContainer.Name, container.Name) + logger.Info("Adding a START container dependency on firelens for container", logger.Fields{ + field.TaskID: task.GetID(), + "firelensContainer": firelensContainer.Name, + field.Container: container.Name, + }) container.AddContainerDependency(firelensContainer.Name, ContainerOrderingStartCondition) } } @@ -1125,10 +1177,7 @@ func (task *Task) collectFirelensLogEnvOptions(containerToLogOptions map[string] // container's host config. 
func (task *Task) AddFirelensContainerBindMounts(firelensConfig *apicontainer.FirelensConfig, hostConfig *dockercontainer.HostConfig, config *config.Config) *apierrors.HostConfigError { - taskID, err := task.GetID() - if err != nil { - return &apierrors.HostConfigError{Msg: err.Error()} - } + taskID := task.GetID() var configBind, s3ConfigBind, socketBind string switch firelensConfig.Type { @@ -1220,7 +1269,10 @@ func (task *Task) addNetworkResourceProvisioningDependency(cfg *config.Config) e for _, resource := range task.GetResources() { if resource.DependOnTaskNetwork() { - seelog.Debugf("Task [%s]: adding network pause container dependency to resource [%s]", task.Arn, resource.GetName()) + logger.Debug("Adding network pause container dependency to resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Resource: resource.GetName(), + }) resource.BuildContainerDependency(NetworkPauseContainerName, apicontainerstatus.ContainerResourcesProvisioned, resourcestatus.ResourceStatus(taskresourcevolume.VolumeCreated)) } } @@ -1296,15 +1348,16 @@ func (task *Task) UpdateMountPoints(cont *apicontainer.Container, vols []types.M // there was no change // Invariant: task known status is the minimum of container known status func (task *Task) updateTaskKnownStatus() (newStatus apitaskstatus.TaskStatus) { - seelog.Debugf("api/task: Updating task's known status, task: %s", task.String()) + logger.Debug("Updating task's known status", logger.Fields{ + field.TaskID: task.GetID(), + }) // Set to a large 'impossible' status that can't be the min containerEarliestKnownStatus := apicontainerstatus.ContainerZombie - var earliestKnownStatusContainer *apicontainer.Container - essentialContainerStopped := false + var earliestKnownStatusContainer, essentialContainerStopped *apicontainer.Container for _, container := range task.Containers { containerKnownStatus := container.GetKnownStatus() if containerKnownStatus == apicontainerstatus.ContainerStopped && container.Essential { - essentialContainerStopped = true + essentialContainerStopped = container } if containerKnownStatus < containerEarliestKnownStatus { containerEarliestKnownStatus = containerKnownStatus @@ -1312,19 +1365,25 @@ func (task *Task) updateTaskKnownStatus() (newStatus apitaskstatus.TaskStatus) { } } if earliestKnownStatusContainer == nil { - seelog.Criticalf( - "Impossible state found while updating tasks's known status, earliest state recorded as %s for task [%v]", - containerEarliestKnownStatus.String(), task) + logger.Critical("Impossible state found while updating tasks's known status", logger.Fields{ + field.TaskID: task.GetID(), + "earliestKnownStatus": containerEarliestKnownStatus.String(), + }) return apitaskstatus.TaskStatusNone } - seelog.Debugf("api/task: Container with earliest known container is [%s] for task: %s", - earliestKnownStatusContainer.String(), task.String()) + logger.Debug("Found container with earliest known status", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: earliestKnownStatusContainer.Name, + field.KnownStatus: earliestKnownStatusContainer.GetKnownStatus(), + field.DesiredStatus: earliestKnownStatusContainer.GetDesiredStatus(), + }) // If the essential container is stopped while other containers may be running // don't update the task status until the other containers are stopped. 
- if earliestKnownStatusContainer.IsKnownSteadyState() && essentialContainerStopped { - seelog.Debugf( - "Essential container is stopped while other containers are running, not updating task status for task: %s", - task.String()) + if earliestKnownStatusContainer.IsKnownSteadyState() && essentialContainerStopped != nil { + logger.Debug("Essential container is stopped while other containers are running, not updating task status", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: essentialContainerStopped.Name, + }) return apitaskstatus.TaskStatusNone } // We can't rely on earliest container known status alone for determining if the @@ -1335,7 +1394,7 @@ func (task *Task) updateTaskKnownStatus() (newStatus apitaskstatus.TaskStatus) { if task.GetKnownStatus() < earliestKnownTaskStatus { task.SetKnownStatus(earliestKnownTaskStatus) logger.Info("Container change also resulted in task change", logger.Fields{ - field.TaskARN: task.Arn, + field.TaskID: task.GetID(), field.Container: earliestKnownStatusContainer.Name, field.RuntimeID: earliestKnownStatusContainer.RuntimeID, field.DesiredStatus: task.GetDesiredStatus().String(), @@ -1350,7 +1409,9 @@ func (task *Task) updateTaskKnownStatus() (newStatus apitaskstatus.TaskStatus) { // based on the known statuses of all containers in the task func (task *Task) getEarliestKnownTaskStatusForContainers() apitaskstatus.TaskStatus { if len(task.Containers) == 0 { - seelog.Criticalf("No containers in the task: %s", task.String()) + logger.Critical("No containers in the task", logger.Fields{ + field.TaskID: task.GetID(), + }) return apitaskstatus.TaskStatusNone } // Set earliest container status to an impossible to reach 'high' task status @@ -1539,13 +1600,21 @@ func (task *Task) overrideContainerRuntime(container *apicontainer.Container, ho if task.NvidiaRuntime == "" { return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"} } - seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name) + logger.Debug("Setting runtime for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "runTime": task.NvidiaRuntime, + }) hostCfg.Runtime = task.NvidiaRuntime } } if cfg.InferentiaSupportEnabled && container.RequireNeuronRuntime() { - seelog.Debugf("Setting runtime as %s for container %s", neuronRuntime, container.Name) + logger.Debug("Setting runtime for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "runTime": neuronRuntime, + }) hostCfg.Runtime = neuronRuntime } return nil @@ -1556,8 +1625,11 @@ func (task *Task) getDockerResources(container *apicontainer.Container, cfg *con // Convert MB to B and set Memory dockerMem := int64(container.Memory * 1024 * 1024) if dockerMem != 0 && dockerMem < apicontainer.DockerContainerMinimumMemoryInBytes { - seelog.Warnf("Task %s container %s memory setting is too low, increasing to %d bytes", - task.Arn, container.Name, apicontainer.DockerContainerMinimumMemoryInBytes) + logger.Warn("Memory setting too low for container, increasing to minimum", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "bytes": apicontainer.DockerContainerMinimumMemoryInBytes, + }) dockerMem = apicontainer.DockerContainerMinimumMemoryInBytes } // Set CPUShares @@ -1607,14 +1679,18 @@ func (task *Task) shouldOverrideNetworkMode(container *apicontainer.Container, d } } if pauseContName == "" { - seelog.Critical("Pause container required, but not found in the task: %s", 
task.String()) + logger.Critical("Pause container required, but not found in the task", logger.Fields{ + field.TaskID: task.GetID(), + }) return false, "" } pauseContainer, ok := dockerContainerMap[pauseContName] if !ok || pauseContainer == nil { // This should never be the case and implies a code-bug. - seelog.Criticalf("Pause container required, but not found in container map for container: [%s] in task: %s", - container.String(), task.String()) + logger.Critical("Pause container required, but not found in container map", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) return false, "" } return true, dockerMappingContainerPrefix + pauseContainer.DockerID @@ -1704,14 +1780,18 @@ func (task *Task) shouldOverridePIDMode(container *apicontainer.Container, docke case pidModeTask: pauseCont, ok := task.ContainerByName(NamespacePauseContainerName) if !ok { - seelog.Criticalf("Namespace Pause container not found in the task: %s; Setting Task's Desired Status to Stopped", task.Arn) + logger.Critical("Namespace Pause container not found; stopping task", logger.Fields{ + field.TaskID: task.GetID(), + }) task.SetDesiredStatus(apitaskstatus.TaskStopped) return false, "" } pauseDockerID, ok := dockerContainerMap[pauseCont.Name] if !ok || pauseDockerID == nil { // Docker container shouldn't be nil or not exist if the Container definition within task exists; implies code-bug - seelog.Criticalf("Namespace Pause docker container not found in the task: %s; Setting Task's Desired Status to Stopped", task.Arn) + logger.Critical("Namespace Pause docker container not found; stopping task", logger.Fields{ + field.TaskID: task.GetID(), + }) task.SetDesiredStatus(apitaskstatus.TaskStopped) return false, "" } @@ -1757,14 +1837,18 @@ func (task *Task) shouldOverrideIPCMode(container *apicontainer.Container, docke case ipcModeTask: pauseCont, ok := task.ContainerByName(NamespacePauseContainerName) if !ok { - seelog.Criticalf("Namespace Pause container not found in the task: %s; Setting Task's Desired Status to Stopped", task.Arn) + logger.Critical("Namespace Pause container not found; stopping task", logger.Fields{ + field.TaskID: task.GetID(), + }) task.SetDesiredStatus(apitaskstatus.TaskStopped) return false, "" } pauseDockerID, ok := dockerContainerMap[pauseCont.Name] if !ok || pauseDockerID == nil { // Docker container shouldn't be nill or not exist if the Container definition within task exists; implies code-bug - seelog.Criticalf("Namespace Pause container not found in the task: %s; Setting Task's Desired Status to Stopped", task.Arn) + logger.Critical("Namespace Pause container not found; stopping task", logger.Fields{ + field.TaskID: task.GetID(), + }) task.SetDesiredStatus(apitaskstatus.TaskStopped) return false, "" } @@ -1823,8 +1907,11 @@ func (task *Task) dockerLinks(container *apicontainer.Container, dockerContainer if len(linkParts) == 2 { linkAlias = linkParts[1] } else { - seelog.Warnf("Link name [%s] found with no linkalias for container: [%s] in task: [%s]", - linkName, container.String(), task.String()) + logger.Warn("Link name found with no linkalias for container", logger.Fields{ + field.TaskID: task.GetID(), + "link": linkName, + field.Container: container.Name, + }) linkAlias = linkName } @@ -1883,9 +1970,13 @@ func (task *Task) dockerHostBinds(container *apicontainer.Container) ([]string, } if hv.Source() == "" || mountPoint.ContainerPath == "" { - seelog.Errorf( - "Unable to resolve volume mounts for container [%s]; invalid path: [%s]; [%s] -> [%s] in 
task: [%s]", - container.Name, mountPoint.SourceVolume, hv.Source(), mountPoint.ContainerPath, task.String()) + logger.Error("Unable to resolve volume mounts for container; invalid path", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Volume: mountPoint.SourceVolume, + "path": hv.Source(), + "containerPath": mountPoint.ContainerPath, + }) return []string{}, errors.Errorf("Unable to resolve volume mounts; invalid path: %s %s; %s -> %s", container.Name, mountPoint.SourceVolume, hv.Source(), mountPoint.ContainerPath) } @@ -1922,7 +2013,10 @@ func (task *Task) UpdateDesiredStatus() { // updateTaskDesiredStatusUnsafe determines what status the task should properly be at based on the containers' statuses // Invariant: task desired status must be stopped if any essential container is stopped func (task *Task) updateTaskDesiredStatusUnsafe() { - seelog.Debugf("Updating task: [%s]", task.stringUnsafe()) + logger.Debug("Updating task's desired status", logger.Fields{ + field.TaskID: task.GetID(), + field.KnownStatus: task.KnownStatusUnsafe.String(), + }) // A task's desired status is stopped if any essential container is stopped // Otherwise, the task's desired status is unchanged (typically running, but no need to change) @@ -1931,8 +2025,12 @@ func (task *Task) updateTaskDesiredStatusUnsafe() { break } if cont.Essential && (cont.KnownTerminal() || cont.DesiredTerminal()) { - seelog.Infof("api/task: Updating task desired status to stopped because of container: [%s]; task: [%s]", - cont.Name, task.stringUnsafe()) + logger.Info("Essential container stopped; updating task desired status to stopped", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Name, + field.KnownStatus: task.KnownStatusUnsafe.String(), + field.DesiredStatus: apitaskstatus.TaskStopped.String(), + }) task.DesiredStatusUnsafe = apitaskstatus.TaskStopped } } @@ -2222,22 +2320,19 @@ func (task *Task) stringUnsafe() string { // GetID is used to retrieve the taskID from taskARN // Reference: http://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html#arn-syntax-ecs -func (task *Task) GetID() (string, error) { - // Parse taskARN - parsedARN, err := arn.Parse(task.Arn) - if err != nil { - return "", errors.Wrapf(err, "task get-id: malformed taskARN: %s", task.Arn) - } - - // Get task resource section - resource := parsedARN.Resource - - if !strings.Contains(resource, arnResourceDelimiter) { - return "", errors.Errorf("task get-id: malformed task resource: %s", resource) - } +func (task *Task) GetID() string { + task.setIdOnce.Do(func() { + id, err := utils.TaskIdFromArn(task.Arn) + if err != nil { + logger.Error("Error getting ID for task", logger.Fields{ + field.TaskARN: task.Arn, + field.Error: err, + }) + } + task.id = id + }) - resourceSplit := strings.Split(resource, arnResourceDelimiter) - return resourceSplit[len(resourceSplit)-1], nil + return task.id } // RecordExecutionStoppedAt checks if this is an essential container stopped @@ -2256,8 +2351,11 @@ func (task *Task) RecordExecutionStoppedAt(container *apicontainer.Container) { // ExecutionStoppedAt was already recorded. Nothing to left to do here return } - seelog.Infof("Task [%s]: recording execution stopped time. 
Essential container [%s] stopped at: %s", - task.Arn, container.Name, now.String()) + logger.Info("Essential container stopped; recording task stopped time", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Time: now.String(), + }) } // GetResources returns the list of task resources from ResourcesMap @@ -2286,10 +2384,11 @@ func (task *Task) AddResource(resourceType string, resource taskresource.TaskRes // SetTerminalReason sets the terminalReason string and this can only be set // once per the task's lifecycle. This field does not accept updates. func (task *Task) SetTerminalReason(reason string) { - seelog.Infof("Task [%s]: attempting to set terminal reason for task [%s]", task.Arn, reason) task.terminalReasonOnce.Do(func() { - seelog.Infof("Task [%s]: setting terminal reason for task [%s]", task.Arn, reason) - + logger.Info("Setting terminal reason for task", logger.Fields{ + field.TaskID: task.GetID(), + field.Reason: reason, + }) // Converts the first letter of terminal reason into capital letter words := strings.Fields(reason) words[0] = strings.Title(words[0]) diff --git a/agent/api/task/task_linux.go b/agent/api/task/task_linux.go index 400162682b6..605cb7ab214 100644 --- a/agent/api/task/task_linux.go +++ b/agent/api/task/task_linux.go @@ -20,6 +20,8 @@ import ( "path/filepath" "time" + "github.com/aws/amazon-ecs-agent/agent/utils" + apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" apieni "github.com/aws/amazon-ecs-agent/agent/api/eni" "github.com/aws/amazon-ecs-agent/agent/config" @@ -78,11 +80,10 @@ func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPerio // BuildCgroupRoot helps build the task cgroup prefix // Example: /ecs/task-id func (task *Task) BuildCgroupRoot() (string, error) { - taskID, err := task.GetID() + taskID, err := utils.TaskIdFromArn(task.Arn) if err != nil { - return "", errors.Wrapf(err, "task build cgroup root: unable to get task-id from task ARN: %s", task.Arn) + return "", err } - return filepath.Join(config.DefaultTaskCgroupPrefix, taskID), nil } diff --git a/agent/api/task/task_test.go b/agent/api/task/task_test.go index bc80206ef54..f25a11f2c16 100644 --- a/agent/api/task/task_test.go +++ b/agent/api/task/task_test.go @@ -1640,30 +1640,6 @@ func assertSetStructFieldsEqual(t *testing.T, expected, actual interface{}) { } } -// TestGetIDErrorPaths performs table tests on GetID with erroneous taskARNs -func TestGetIDErrorPaths(t *testing.T) { - testCases := []struct { - arn string - name string - }{ - {"", "EmptyString"}, - {"invalidArn", "InvalidARN"}, - {"arn:aws:ecs:region:account-id:task:task-id", "IncorrectSections"}, - {"arn:aws:ecs:region:account-id:task", "IncorrectResourceSections"}, - } - - task := Task{} - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - task.Arn = tc.arn - taskID, err := task.GetID() - assert.Error(t, err, "GetID should return an error") - assert.Empty(t, taskID, "ID should be empty") - }) - } -} - // TestGetIDHappyPath validates the happy path of GetID func TestGetIDHappyPath(t *testing.T) { taskNormalARN := Task{ @@ -1673,14 +1649,11 @@ func TestGetIDHappyPath(t *testing.T) { Arn: "arn:aws:ecs:region:account-id:task/cluster-name/task-id", } - taskID, err := taskNormalARN.GetID() - assert.NoError(t, err) + taskID := taskNormalARN.GetID() assert.Equal(t, "task-id", taskID) - taskID, err = taskLongARN.GetID() - assert.NoError(t, err) + taskID = taskLongARN.GetID() assert.Equal(t, "task-id", taskID) - } // 
TestTaskGetPrimaryENI tests the eni can be correctly acquired by calling GetTaskPrimaryENI diff --git a/agent/app/agent.go b/agent/app/agent.go index bfe71daa52a..91658edcd14 100644 --- a/agent/app/agent.go +++ b/agent/app/agent.go @@ -20,6 +20,9 @@ import ( "fmt" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/doctor" "github.com/aws/amazon-ecs-agent/agent/eni/watcher" "github.com/aws/aws-sdk-go/aws/awserr" @@ -151,8 +154,11 @@ func newAgent(blackholeEC2Metadata bool, acceptInsecureCert *bool) (agent, error if blackholeEC2Metadata { ec2MetadataClient = ec2.NewBlackholeEC2MetadataClient() } - - seelog.Info("Loading configuration") + logger.Info("Starting Amazon ECS Agent", logger.Fields{ "version": version.Version, "commit": version.GitShortHash, }) + logger.Info("Loading configuration") cfg, err := config.NewConfig(ec2MetadataClient) if err != nil { // All required config values can be inferred from EC2 Metadata, @@ -165,11 +171,10 @@ func newAgent(blackholeEC2Metadata bool, acceptInsecureCert *bool) (agent, error if cfg.AcceptInsecureCert { seelog.Warn("SSL certificate verification disabled. This is not recommended.") } - seelog.Infof("Amazon ECS agent Version: %s, Commit: %s", version.Version, version.GitShortHash) seelog.Debugf("Loaded config: %s", cfg.String()) if cfg.External.Enabled() { - seelog.Info("Running in external mode.") + logger.Info("ECS Agent is running in external mode.") ec2MetadataClient = ec2.NewBlackholeEC2MetadataClient() cfg.NoIID = true } @@ -179,7 +184,9 @@ func newAgent(blackholeEC2Metadata bool, acceptInsecureCert *bool) (agent, error if err != nil { // This is also non terminal in the current config - seelog.Criticalf("Error creating Docker client: %v", err) + logger.Critical("Error creating Docker client", logger.Fields{ field.Error: err, }) cancel() return nil, err } @@ -188,7 +195,9 @@ if cfg.Checkpoint.Enabled() { dataClient, err = data.New(cfg.DataDir) if err != nil { - seelog.Criticalf("Error creating data client: %v", err) + logger.Critical("Error creating data client", logger.Fields{ field.Error: err, }) cancel() return nil, err } } @@ -572,11 +581,17 @@ func (agent *ecsAgent) setClusterInConfig(previousCluster string) error { err := clusterMismatchError{ fmt.Errorf(clusterMismatchErrorFormat, previousCluster, configuredCluster), } - seelog.Criticalf("%v", err) + logger.Critical("Error restoring cluster", logger.Fields{ "previousCluster": previousCluster, "configuredCluster": configuredCluster, field.Error: err, }) return err } agent.cfg.Cluster = previousCluster - seelog.Infof("Restored cluster '%s'", agent.cfg.Cluster) + logger.Info("Cluster was successfully restored", logger.Fields{ "cluster": agent.cfg.Cluster, }) return nil } @@ -596,8 +611,9 @@ func (agent *ecsAgent) getEC2InstanceID() string { } } if err != nil { - seelog.Warnf( "Unable to access EC2 Metadata service to determine EC2 ID: %v", err) + logger.Warn("Unable to access EC2 Metadata service to determine EC2 ID", logger.Fields{ field.Error: err, }) } return instanceID } @@ -687,20 +703,27 @@ func (agent *ecsAgent) registerContainerInstance( outpostARN := agent.getoutpostARN() if agent.containerInstanceARN != "" { - seelog.Infof("Restored from checkpoint file. 
I am running as '%s' in cluster '%s'", agent.containerInstanceARN, agent.cfg.Cluster) + logger.Info("Restored from checkpoint file", logger.Fields{ + "containerInstanceARN": agent.containerInstanceARN, + "cluster": agent.cfg.Cluster, + }) return agent.reregisterContainerInstance(client, capabilities, tags, uuid.New(), platformDevices, outpostARN) } - seelog.Info("Registering Instance with ECS") + logger.Info("Registering Instance with ECS") containerInstanceArn, availabilityZone, err := client.RegisterContainerInstance("", capabilities, tags, uuid.New(), platformDevices, outpostARN) if err != nil { - seelog.Errorf("Error registering: %v", err) + logger.Error("Error registering container instance", logger.Fields{ + field.Error: err, + }) if retriable, ok := err.(apierrors.Retriable); ok && !retriable.Retry() { return err } if utils.IsAWSErrorCodeEqual(err, ecs.ErrCodeInvalidParameterException) { - seelog.Critical("Instance registration attempt with an invalid parameter") + logger.Critical("Instance registration attempt with an invalid parameter", logger.Fields{ + field.Error: err, + }) return err } if _, ok := err.(apierrors.AttributeError); ok { @@ -708,12 +731,17 @@ func (agent *ecsAgent) registerContainerInstance( if len(agent.cfg.InstanceAttributes) > 0 { attributeErrorMsg = customAttributeErrorMessage } - seelog.Critical("Instance registration attempt with invalid attribute(s)." + attributeErrorMsg) + logger.Critical("Instance registration attempt with invalid attribute(s)", logger.Fields{ + field.Error: attributeErrorMsg, + }) return err } return transientError{err} } - seelog.Infof("Registration completed successfully. I am running as '%s' in cluster '%s'", containerInstanceArn, agent.cfg.Cluster) + logger.Info("Instance registration completed successfully", logger.Fields{ + "instanceArn": containerInstanceArn, + "cluster": agent.cfg.Cluster, + }) agent.containerInstanceARN = containerInstanceArn agent.availabilityZone = availabilityZone return nil @@ -733,7 +761,9 @@ func (agent *ecsAgent) reregisterContainerInstance(client api.ECSClient, capabil if err == nil { return nil } - seelog.Errorf("Error re-registering: %v", err) + logger.Error("Error re-registering container instance", logger.Fields{ + field.Error: err, + }) if apierrors.IsInstanceTypeChangedError(err) { seelog.Criticalf(instanceTypeMismatchErrorFormat, err) return err @@ -743,7 +773,9 @@ func (agent *ecsAgent) reregisterContainerInstance(client api.ECSClient, capabil if len(agent.cfg.InstanceAttributes) > 0 { attributeErrorMsg = customAttributeErrorMessage } - seelog.Critical("Instance re-registration attempt with invalid attribute(s)." 
+ attributeErrorMsg) + logger.Critical("Instance re-registration attempt with invalid attribute(s)", logger.Fields{ + field.Error: attributeErrorMsg, + }) return err } return transientError{err} diff --git a/agent/dockerclient/dockerapi/docker_client.go b/agent/dockerclient/dockerapi/docker_client.go index e014ceaa81b..c5781393666 100644 --- a/agent/dockerclient/dockerapi/docker_client.go +++ b/agent/dockerclient/dockerapi/docker_client.go @@ -28,6 +28,9 @@ import ( "sync/atomic" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" apierrors "github.com/aws/amazon-ecs-agent/agent/api/errors" @@ -1414,7 +1417,9 @@ func (dg *dockerGoClient) Stats(ctx context.Context, id string, inactivityTimeou var resp types.ContainerStats if !dg.config.PollMetrics.Enabled() { // Streaming metrics is the default behavior - seelog.Infof("DockerGoClient: Starting streaming metrics for container %s", id) + logger.Info("Start streaming metrics for container", logger.Fields{ + field.RuntimeID: id, + }) go func() { defer cancelRequest() defer close(statsC) diff --git a/agent/engine/common_test.go b/agent/engine/common_test.go index b6945e04248..5e160117e50 100644 --- a/agent/engine/common_test.go +++ b/agent/engine/common_test.go @@ -252,7 +252,6 @@ func addTaskToEngine(t *testing.T, taskEngine.AddTask(sleepTask) waitForRunningEvents(t, taskEngine.StateChangeEvents()) - // Wait for all events to be consumed prior to moving it towards stopped; we // don't want to race the below with these or we'll end up with the "going // backwards in state" stop and we haven't 'expect'd for that diff --git a/agent/engine/docker_image_manager.go b/agent/engine/docker_image_manager.go index 3031cbbadef..77b8ae94903 100644 --- a/agent/engine/docker_image_manager.go +++ b/agent/engine/docker_image_manager.go @@ -21,6 +21,9 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/docker/docker/api/types" apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" @@ -102,7 +105,9 @@ func buildImageCleanupExclusionList(cfg *config.Config) []string { config.CachedImageNameAgentContainer, ) for _, image := range excludedImages { - seelog.Infof("Image excluded from cleanup: %s", image) + logger.Info("Image excluded from cleanup", logger.Fields{ + field.Image: image, + }) } return excludedImages } diff --git a/agent/engine/docker_task_engine.go b/agent/engine/docker_task_engine.go index 5bcb48209b6..69d1e2d0d41 100644 --- a/agent/engine/docker_task_engine.go +++ b/agent/engine/docker_task_engine.go @@ -25,6 +25,8 @@ import ( "sync" "time" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/amazon-ecs-agent/agent/logger" "github.com/aws/amazon-ecs-agent/agent/logger/field" @@ -56,7 +58,6 @@ import ( "github.com/aws/amazon-ecs-agent/agent/utils/ttime" dockercontainer "github.com/docker/docker/api/types/container" - "github.com/cihub/seelog" "github.com/docker/docker/api/types" "github.com/pkg/errors" ) @@ -319,7 +320,10 @@ func (engine *DockerTaskEngine) monitorExecAgentRunning(ctx context.Context, task := mTask.Task dockerID, err := engine.getDockerID(task, c) if err != nil { - seelog.Errorf("Task engine [%s]: Could not retrieve docker id for container", task.Arn) + logger.Error("Could not retrieve docker id for container", logger.Fields{ + 
field.TaskID: task.GetID(), + field.Container: c.Name, + }) return } // Sleeping here so that all the containers do not call inspect/start exec agent process @@ -330,7 +334,12 @@ func (engine *DockerTaskEngine) monitorExecAgentRunning(ctx context.Context, time.Sleep(retry.AddJitter(time.Nanosecond, engine.monitorExecAgentsInterval/2)) status, err := engine.execCmdMgr.RestartAgentIfStopped(ctx, engine.client, task, c, dockerID) if err != nil { - seelog.Errorf("Task engine [%s]: Failed to restart ExecCommandAgent Process for container [%s]: %v", task.Arn, dockerID, err) + logger.Error("Failed to restart ExecCommandAgent Process for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: c.Name, + field.DockerId: dockerID, + field.Error: err, + }) mTask.emitManagedAgentEvent(mTask.Task, c, execcmd.ExecuteCommandAgentName, "ExecuteCommandAgent cannot be restarted") } if status == execcmd.Restarted { @@ -357,7 +366,9 @@ func (engine *DockerTaskEngine) MustInit(ctx context.Context) { err := engine.Init(ctx) if err != nil { errorOnce.Do(func() { - seelog.Errorf("Task engine: could not connect to docker daemon: %v", err) + logger.Error("Could not connect to docker daemon", logger.Fields{ + field.Error: err, + }) }) } return err @@ -409,11 +420,19 @@ func (engine *DockerTaskEngine) synchronizeState() { timeoutFunc := func() { eniAttachment, ok := engine.state.ENIByMac(eniAttachment.MACAddress) if !ok { - seelog.Warnf("Ignoring unmanaged ENI attachment with MAC address: %s", eniAttachment.MACAddress) + logger.Warn("Ignoring unmanaged ENI attachment", logger.Fields{ + "macAddress": eniAttachment.MACAddress, + }) return } if !eniAttachment.IsSent() { - seelog.Warnf("Timed out waiting for ENI ack; removing ENI attachment record %s", eniAttachment.String()) + logger.Warn("Timed out waiting for ENI ack; removing ENI attachment record", logger.Fields{ + field.TaskARN: eniAttachment.TaskARN, + field.Status: eniAttachment.Status.String(), + "attachmentArn": eniAttachment.AttachmentARN, + "expiresAt": eniAttachment.ExpiresAt.Format(time.RFC3339), + "type": eniAttachment.AttachmentType, + }) engine.removeENIAttachmentData(eniAttachment.MACAddress) engine.state.RemoveENIAttachment(eniAttachment.MACAddress) } @@ -422,7 +441,13 @@ func (engine *DockerTaskEngine) synchronizeState() { if err != nil { // The only case where we get an error from Initialize is that the attachment has expired. In that case, remove the expired // attachment from state. - seelog.Warnf("ENI attachment has expired. Removing it from state. 
%s", eniAttachment.String()) + logger.Warn("ENI attachment has expired; removing it from state", logger.Fields{ + field.TaskARN: eniAttachment.TaskARN, + field.Status: eniAttachment.Status.String(), + "attachmentArn": eniAttachment.AttachmentARN, + "expiresAt": eniAttachment.ExpiresAt.Format(time.RFC3339), + "type": eniAttachment.AttachmentType, + }) engine.removeENIAttachmentData(eniAttachment.MACAddress) engine.state.RemoveENIAttachment(eniAttachment.MACAddress) } @@ -509,14 +534,19 @@ func updateContainerMetadata(metadata *dockerapi.DockerContainerMetadata, contai // synchronizeContainerStatus checks and updates the container status with docker func (engine *DockerTaskEngine) synchronizeContainerStatus(container *apicontainer.DockerContainer, task *apitask.Task) { if container.DockerID == "" { - seelog.Debugf("Task engine [%s]: found container potentially created while we were down: %s", - task.Arn, container.DockerName) + logger.Debug("Found container potentially created while we were down", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + }) // Figure out the dockerid describedContainer, err := engine.client.InspectContainer(engine.ctx, container.DockerName, dockerclient.InspectContainerTimeout) if err != nil { - seelog.Warnf("Task engine [%s]: could not find matching container for expected name [%s]: %v", - task.Arn, container.DockerName, err) + logger.Warn("Error getting container with name", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + field.Error: err, + }) } else { // update the container metadata in case the container was created during agent restart metadata := dockerapi.MetadataFromContainer(describedContainer) @@ -528,8 +558,11 @@ func (engine *DockerTaskEngine) synchronizeContainerStatus(container *apicontain engine.state.AddContainer(container, task) err := engine.imageManager.RecordContainerReference(container.Container) if err != nil { - seelog.Warnf("Task engine [%s]: unable to add container reference to image state: %v", - task.Arn, err) + logger.Warn("Unable to add container reference to image state", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + field.Error: err, + }) } } return @@ -540,14 +573,22 @@ func (engine *DockerTaskEngine) synchronizeContainerStatus(container *apicontain currentState = apicontainerstatus.ContainerStopped // If this is a Docker API error if metadata.Error.ErrorName() == dockerapi.CannotDescribeContainerErrorName { - seelog.Warnf("Task engine [%s]: could not describe previously known container [id=%s; name=%s]; assuming dead: %v", - task.Arn, container.DockerID, container.DockerName, metadata.Error) + logger.Warn("Could not describe previously known container; assuming dead", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + field.DockerId: container.DockerID, + field.Error: metadata.Error, + }) if !container.Container.KnownTerminal() { container.Container.ApplyingError = apierrors.NewNamedError(&ContainerVanishedError{}) err := engine.imageManager.RemoveContainerReferenceFromImageState(container.Container) if err != nil { - seelog.Warnf("Task engine [%s]: could not remove container reference for image state %s: %v", - container.Container.Image, err) + logger.Warn("Could not remove container reference from image state", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + "image": container.Container.Image, + field.Error: err, + }) } } } else { @@ 
-560,8 +601,11 @@ func (engine *DockerTaskEngine) synchronizeContainerStatus(container *apicontain updateContainerMetadata(&metadata, container.Container, task) err := engine.imageManager.RecordContainerReference(container.Container) if err != nil { - seelog.Warnf("Task engine [%s]: unable to add container reference to image state: %v", - task.Arn, err) + logger.Warn("Unable to add container reference to image state", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.DockerName, + field.Error: err, + }) } if engine.cfg.ContainerMetadataEnabled.Enabled() && !container.Container.IsMetadataFileUpdated() { go engine.updateMetadataFile(task, container) @@ -606,8 +650,11 @@ func (engine *DockerTaskEngine) sweepTask(task *apitask.Task) { for _, cont := range task.Containers { err := engine.removeContainer(task, cont) if err != nil { - seelog.Infof("Task engine [%s]: unable to remove old container [%s]: %v", - task.Arn, cont.Name, err) + logger.Error("Unable to remove old container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Name, + field.Error: err, + }) } // Internal container(created by ecs-agent) state isn't recorded if cont.IsInternal() { @@ -615,8 +662,11 @@ func (engine *DockerTaskEngine) sweepTask(task *apitask.Task) { } err = engine.imageManager.RemoveContainerReferenceFromImageState(cont) if err != nil { - seelog.Errorf("Task engine [%s]: Unable to remove container [%s] reference from image state: %v", - task.Arn, cont.Name, err) + logger.Error("Unable to remove container reference from image state", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Name, + field.Error: err, + }) } } @@ -624,7 +674,10 @@ func (engine *DockerTaskEngine) sweepTask(task *apitask.Task) { if engine.cfg.ContainerMetadataEnabled.Enabled() { err := engine.metadataManager.Clean(task.Arn) if err != nil { - seelog.Warnf("Task engine [%s]: clean task metadata failed: %v", task.Arn, err) + logger.Warn("Error cleaning task metadata", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) } } } @@ -635,22 +688,27 @@ func (engine *DockerTaskEngine) deleteTask(task *apitask.Task) { for _, resource := range task.GetResources() { err := resource.Cleanup() if err != nil { - seelog.Warnf("Task engine [%s]: unable to cleanup resource %s: %v", - task.Arn, resource.GetName(), err) + logger.Warn("Unable to cleanup resource", logger.Fields{ + field.TaskID: task.GetID(), + field.Resource: resource.GetName(), + field.Error: err, + }) } else { - seelog.Infof("Task engine [%s]: resource %s cleanup complete", task.Arn, - resource.GetName()) + logger.Info("Resource cleanup complete", logger.Fields{ + field.TaskID: task.GetID(), + field.Resource: resource.GetName(), + }) } } if execcmd.IsExecEnabledTask(task) { // cleanup host exec agent log dirs - if tID, err := task.GetID(); err != nil { - seelog.Warnf("Task Engine[%s]: error getting task ID for ExecAgent logs cleanup: %v", task.Arn, err) - } else { - if err := removeAll(filepath.Join(execcmd.ECSAgentExecLogDir, tID)); err != nil { - seelog.Warnf("Task Engine[%s]: unable to remove ExecAgent host logs for task: %v", task.Arn, err) - } + tID := task.GetID() + if err := removeAll(filepath.Join(execcmd.ECSAgentExecLogDir, tID)); err != nil { + logger.Warn("Unable to remove ExecAgent host logs for task", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) } } @@ -663,20 +721,26 @@ func (engine *DockerTaskEngine) deleteTask(task *apitask.Task) { // ENIs that exist only as logical 
associations on another interface do not have // attachments that need to be removed. if taskENI.IsStandardENI() { - seelog.Debugf("Task engine [%s]: removing eni %s from agent state", - task.Arn, taskENI.ID) + logger.Debug("Removing ENI from agent state", logger.Fields{ + field.TaskID: task.GetID(), + "eni": taskENI.ID, + }) engine.removeENIAttachmentData(taskENI.MacAddress) engine.state.RemoveENIAttachment(taskENI.MacAddress) } else { - seelog.Debugf("Task engine [%s]: skipping removing logical eni %s from agent state", - task.Arn, taskENI.ID) + logger.Debug("Skipping removing logical ENI from agent state because it's not a standard ENI", logger.Fields{ + field.TaskID: task.GetID(), + "eni": taskENI.ID, + }) } } // Remove task and container data from database. engine.removeTaskData(task) - seelog.Infof("Task engine [%s]: finished removing task data, removing task from managed tasks", task.Arn) + logger.Info("Finished removing task data, removing task from managed tasks", logger.Fields{ + field.TaskID: task.GetID(), + }) delete(engine.managedTasks, task.Arn) engine.tasksLock.Unlock() } @@ -684,11 +748,17 @@ func (engine *DockerTaskEngine) deleteTask(task *apitask.Task) { func (engine *DockerTaskEngine) emitTaskEvent(task *apitask.Task, reason string) { event, err := api.NewTaskStateChangeEvent(task, reason) if err != nil { - seelog.Infof("Task engine [%s]: unable to create task state change event: %v", task.Arn, err) + logger.Error("Unable to create task state change event", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return } - - seelog.Infof("Task engine [%s]: Task engine: sending change event [%s]", task.Arn, event.String()) + logger.Info("Preparing to send change event", logger.Fields{ + field.TaskID: task.GetID(), + field.Status: event.Status.String(), + field.Reason: event.Reason, + }) engine.stateChangeEvents <- event } @@ -745,18 +815,49 @@ func (engine *DockerTaskEngine) handleDockerEvents(ctx context.Context) { // container and placing it in the context of the task to which that container // belongs. 
func (engine *DockerTaskEngine) handleDockerEvent(event dockerapi.DockerContainerChangeEvent) { - seelog.Debugf("Task engine: handling a docker event: %s", event.String()) + + eventFields := logger.Fields{ + field.DockerId: event.DockerID, + field.Status: event.Status.String(), + "health": event.Health.Status.String(), + } + + if event.ExitCode != nil { + eventFields["exitCode"] = aws.IntValue(event.ExitCode) + } + if len(event.PortBindings) != 0 { + eventFields["portBindings"] = event.PortBindings + } + if event.Error != nil { + eventFields[field.Error] = event.Error + } + if len(event.Volumes) != 0 { + eventFields["volumes"] = event.Volumes + } + if len(event.Labels) != 0 { + eventFields["labels"] = event.Labels + } + if !event.CreatedAt.IsZero() { + eventFields["createdAt"] = event.CreatedAt.String() + } + if !event.StartedAt.IsZero() { + eventFields["startedAt"] = event.StartedAt.String() + } + if !event.FinishedAt.IsZero() { + eventFields["finishedAt"] = event.FinishedAt.String() + } + logger.Debug("Handling a docker event", eventFields) task, ok := engine.state.TaskByID(event.DockerID) if !ok { - seelog.Debugf("Task engine: event for container [%s] not managed, unable to map container id to task", - event.DockerID) + logger.Debug("Unable to map container event to task", eventFields) return } cont, ok := engine.state.ContainerByID(event.DockerID) if !ok { - seelog.Debugf("Task engine: event for container [%s] not managed, unable to map container id to container", - event.DockerID) + logger.Debug("Unable to map container id to container", logger.Fields{ + field.TaskID: task.GetID(), + }, eventFields) return } @@ -764,8 +865,14 @@ func (engine *DockerTaskEngine) handleDockerEvent(event dockerapi.DockerContaine // no need to process this in task manager if event.Type == apicontainer.ContainerHealthEvent { if cont.Container.HealthStatusShouldBeReported() { - seelog.Debugf("Task engine: updating container [%s(%s)] health status: %v", - cont.Container.Name, cont.DockerID, event.DockerContainerMetadata.Health) + logger.Debug("Updating container health status", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Container.Name, + field.DockerId: cont.DockerID, + "status": event.DockerContainerMetadata.Health.Status.String(), + "exitCode": event.DockerContainerMetadata.Health.ExitCode, + "output": event.DockerContainerMetadata.Health.Output, + }) cont.Container.SetHealthStatus(event.DockerContainerMetadata.Health) } return @@ -775,15 +882,18 @@ func (engine *DockerTaskEngine) handleDockerEvent(event dockerapi.DockerContaine managedTask, ok := engine.managedTasks[task.Arn] engine.tasksLock.RUnlock() if !ok { - seelog.Criticalf("Task engine: could not find managed task [%s] corresponding to a docker event: %s", - task.Arn, event.String()) + logger.Critical("Could not find managed task for docker event", logger.Fields{ + field.TaskID: task.GetID(), + }, eventFields) return } - seelog.Debugf("Task engine [%s]: writing docker event to the task: %s", - task.Arn, event.String()) + logger.Debug("Writing docker event to the task", logger.Fields{ + field.TaskID: task.GetID(), + }, eventFields) managedTask.emitDockerContainerChange(dockerContainerChange{container: cont.Container, event: event}) - seelog.Debugf("Task engine [%s]: wrote docker event to the task: %s", - task.Arn, event.String()) + logger.Debug("Wrote docker event to the task", logger.Fields{ + field.TaskID: task.GetID(), + }, eventFields) } // StateChangeEvents returns channels to read task and container state changes. 
These @@ -799,7 +909,10 @@ func (engine *DockerTaskEngine) AddTask(task *apitask.Task) { err := task.PostUnmarshalTask(engine.cfg, engine.credentialsManager, engine.resourceFields, engine.client, engine.ctx) if err != nil { - seelog.Errorf("Task engine [%s]: unable to add task to the engine: %v", task.Arn, err) + logger.Error("Unable to add task to the engine", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) task.SetKnownStatus(apitaskstatus.TaskStopped) task.SetDesiredStatus(apitaskstatus.TaskStopped) engine.emitTaskEvent(task, err.Error()) @@ -821,7 +934,9 @@ func (engine *DockerTaskEngine) AddTask(task *apitask.Task) { if dependencygraph.ValidDependencies(task, engine.cfg) { engine.startTask(task) } else { - seelog.Errorf("Task engine [%s]: unable to progress task with circular dependencies", task.Arn) + logger.Error("Task has circular dependencies; unable to start", logger.Fields{ + field.TaskID: task.GetID(), + }) task.SetKnownStatus(apitaskstatus.TaskStopped) task.SetDesiredStatus(apitaskstatus.TaskStopped) err := TaskDependencyError{task.Arn} @@ -829,7 +944,6 @@ func (engine *DockerTaskEngine) AddTask(task *apitask.Task) { } return } - // Update task engine.updateTaskUnsafe(existingTask, task) } @@ -851,14 +965,17 @@ func (engine *DockerTaskEngine) pullContainer(task *apitask.Task, container *api return dockerapi.DockerContainerMetadata{} } - if engine.imagePullRequired(engine.cfg.ImagePullBehavior, container, task.Arn) { + if engine.imagePullRequired(engine.cfg.ImagePullBehavior, container, task.GetID()) { // Record the pullStoppedAt timestamp defer func() { timestamp := engine.time().Now() task.SetPullStoppedAt(timestamp) }() - - seelog.Infof("Task engine [%s]: pulling image %s for container %s concurrently", task.Arn, container.Image, container.Name) + logger.Info("Pulling image for container concurrently", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) return engine.concurrentPull(task, container) } @@ -871,7 +988,7 @@ func (engine *DockerTaskEngine) pullContainer(task *apitask.Task, container *api engine.state.AddPulledContainer(dockerContainer, task) // No pull image is required, just update container reference and use cached image. - engine.updateContainerReference(false, container, task.Arn) + engine.updateContainerReference(false, container, task.GetID()) // Return the metadata without any error return dockerapi.DockerContainerMetadata{Error: nil} } @@ -881,7 +998,7 @@ func (engine *DockerTaskEngine) pullContainer(task *apitask.Task, container *api // to make sure the container passed in is not an internal container. func (engine *DockerTaskEngine) imagePullRequired(imagePullBehavior config.ImagePullBehaviorType, container *apicontainer.Container, - taskArn string) bool { + taskId string) bool { switch imagePullBehavior { case config.ImagePullOnceBehavior: // If this image has been pulled successfully before, don't pull the image, @@ -889,8 +1006,11 @@ func (engine *DockerTaskEngine) imagePullRequired(imagePullBehavior config.Image // (the image can be prepopulated with the AMI and never be pulled). 
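The comment above describes the decision imagePullRequired makes for the pull-once and prefer-cached behaviors. A rough, self-contained sketch of that decision table, with hypothetical names standing in for the engine's image manager and local Docker cache lookups:

package main

import "fmt"

type pullBehavior int

const (
	pullDefault pullBehavior = iota
	pullAlways
	pullOnce
	pullPreferCached
)

// imagePullRequired mirrors the decision flow: pull-once skips the pull when a
// previous pull succeeded, prefer-cached skips it when the image is cached
// locally, and the default/always behaviors always pull.
func imagePullRequired(behavior pullBehavior, pulledBefore, cachedLocally bool) bool {
	switch behavior {
	case pullOnce:
		return !pulledBefore
	case pullPreferCached:
		return !cachedLocally
	default:
		return true
	}
}

func main() {
	fmt.Println(imagePullRequired(pullOnce, true, false))         // false: already pulled once
	fmt.Println(imagePullRequired(pullPreferCached, false, true)) // false: cached locally
	fmt.Println(imagePullRequired(pullAlways, true, true))        // true: always pull
}

The default and always behaviors fall through to a pull, which is what the hunk below encodes in its default case.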
imageState, ok := engine.imageManager.GetImageStateFromImageName(container.Image) if ok && imageState.GetPullSucceeded() { - seelog.Infof("Task engine [%s]: image %s for container %s has been pulled once, not pulling it again", - taskArn, container.Image, container.Name) + logger.Info("Image for container already exists, not pulling it again", logger.Fields{ + field.TaskID: taskId, + field.Container: container.Name, + field.Image: container.Image, + }) return false } return true @@ -901,8 +1021,11 @@ func (engine *DockerTaskEngine) imagePullRequired(imagePullBehavior config.Image if err != nil { return true } - seelog.Infof("Task engine [%s]: found cached image %s, use it directly for container %s", - taskArn, container.Image, container.Name) + logger.Info("Found cached image, use it directly for container", logger.Fields{ + field.TaskID: taskId, + field.Container: container.Name, + field.Image: container.Image, + }) return false default: // Need to pull the image for always and default agent pull behavior @@ -911,29 +1034,53 @@ func (engine *DockerTaskEngine) imagePullRequired(imagePullBehavior config.Image } func (engine *DockerTaskEngine) concurrentPull(task *apitask.Task, container *apicontainer.Container) dockerapi.DockerContainerMetadata { - seelog.Debugf("Task engine [%s]: attempting to obtain ImagePullDeleteLock to pull image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Debug("Attempting to obtain ImagePullDeleteLock to pull image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) ImagePullDeleteLock.RLock() - seelog.Debugf("Task engine [%s]: acquired ImagePullDeleteLock, start pulling image %s for container %s", - task.Arn, container.Image, container.Name) - defer seelog.Debugf("Task engine [%s]: released ImagePullDeleteLock after pulling image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Debug("Acquired ImagePullDeleteLock, start pulling image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) + defer logger.Debug("Released ImagePullDeleteLock after pulling image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) defer ImagePullDeleteLock.RUnlock() // Record the task pull_started_at timestamp pullStart := engine.time().Now() ok := task.SetPullStartedAt(pullStart) if ok { - seelog.Infof("Task engine [%s]: recording timestamp for starting image pulltime: %s", - task.Arn, pullStart) + logger.Info("Recording start time for image pull", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + "pullStart": pullStart.String(), + }) + } metadata := engine.pullAndUpdateContainerReference(task, container) if metadata.Error == nil { - seelog.Infof("Task engine [%s]: finished pulling image %s for container %s in %s", - task.Arn, container.Image, container.Name, time.Since(pullStart).String()) + elapsed := time.Since(pullStart) + logger.Info("Finished pulling image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + field.Elapsed: elapsed.String(), + "elapsedMs": elapsed.Milliseconds(), + }) } else { - seelog.Errorf("Task engine [%s]: failed to pull image %s for container %s: %v", - task.Arn, container.Image, container.Name, metadata.Error) + 
logger.Error("Failed to pull image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + field.Error: metadata.Error, + }) } return metadata } @@ -942,8 +1089,11 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta // If a task is blocked here for some time, and before it starts pulling image, // the task's desired status is set to stopped, then don't pull the image if task.GetDesiredStatus() == apitaskstatus.TaskStopped { - seelog.Infof("Task engine [%s]: task's desired status is stopped, skipping pulling image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Warn("Task's desired status is stopped, skipping image pull for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) container.SetDesiredStatus(apicontainerstatus.ContainerStopped) return dockerapi.DockerContainerMetadata{Error: TaskStoppedBeforePullBeginError{task.Arn}} } @@ -952,8 +1102,11 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta if container.ShouldPullWithExecutionRole() { executionCredentials, ok := engine.credentialsManager.GetTaskCredentials(task.GetExecutionCredentialsID()) if !ok { - seelog.Errorf("Task engine [%s]: unable to acquire ECR credentials for image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Error("Unable to acquire ECR credentials to pull image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) return dockerapi.DockerContainerMetadata{ Error: dockerapi.CannotPullECRContainerError{ FromError: errors.New("engine ecr credentials: not found"), @@ -970,8 +1123,12 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta // Apply registry auth data from ASM if required if container.ShouldPullWithASMAuth() { if err := task.PopulateASMAuthData(container); err != nil { - seelog.Errorf("Task engine [%s]: unable to acquire Docker registry credentials for image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Error("Unable to acquire Docker registry credentials to pull image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + field.Error: err, + }) return dockerapi.DockerContainerMetadata{ Error: dockerapi.CannotPullContainerAuthError{ FromError: errors.New("engine docker private registry credentials: not found"), @@ -996,8 +1153,12 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta // search the image in local cached images if engine.cfg.DependentContainersPullUpfront.Enabled() && engine.cfg.ImagePullBehavior != config.ImagePullAlwaysBehavior { if _, err := engine.client.InspectImage(container.Image); err != nil { - seelog.Errorf("Task engine [%s]: failed to find cached image %s for container %s", - task.Arn, container.Image, container.Name) + logger.Error("Failed to find cached image for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + field.Error: err, + }) // Stop the task if the container is an essential container, // and the image is not available in both remote and local caches if container.IsEssential() { @@ -1006,8 +1167,11 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta } return 
dockerapi.DockerContainerMetadata{Error: metadata.Error} } - seelog.Infof("Task engine [%s]: found cached image %s, use it directly for container %s", - task.Arn, container.Image, container.Name) + logger.Info("Found cached image, use it directly for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Image: container.Image, + }) findCachedImage = true } } @@ -1019,15 +1183,18 @@ func (engine *DockerTaskEngine) pullAndUpdateContainerReference(task *apitask.Ta engine.state.AddPulledContainer(dockerContainer, task) } - engine.updateContainerReference(pullSucceeded, container, task.Arn) + engine.updateContainerReference(pullSucceeded, container, task.GetID()) return metadata } -func (engine *DockerTaskEngine) updateContainerReference(pullSucceeded bool, container *apicontainer.Container, taskArn string) { +func (engine *DockerTaskEngine) updateContainerReference(pullSucceeded bool, container *apicontainer.Container, taskId string) { err := engine.imageManager.RecordContainerReference(container) if err != nil { - seelog.Errorf("Task engine [%s]: unable to add container reference to image state: %v", - taskArn, err) + logger.Error("Unable to add container reference to image state", logger.Fields{ + field.TaskID: taskId, + field.Container: container.Name, + field.Error: err, + }) } imageState, ok := engine.imageManager.GetImageStateFromImageName(container.Image) if ok && pullSucceeded { @@ -1036,8 +1203,12 @@ func (engine *DockerTaskEngine) updateContainerReference(pullSucceeded bool, con imageState.SetPullSucceeded(true) err = engine.dataClient.SaveImageState(imageState) if err != nil { - seelog.Warnf("Task engine [%s]: unable to save image state: %v", - taskArn, err) + logger.Warn("Unable to save image state", logger.Fields{ + field.TaskID: taskId, + field.Container: container.Name, + field.Image: container.Image, + field.Error: err, + }) } } } @@ -1045,7 +1216,10 @@ func (engine *DockerTaskEngine) updateContainerReference(pullSucceeded bool, con } func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *apicontainer.Container) dockerapi.DockerContainerMetadata { - seelog.Infof("Task engine [%s]: creating container: %s", task.Arn, container.Name) + logger.Info("Creating container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) client := engine.client if container.DockerConfig.Version != nil { client = client.WithVersion(dockerclient.DockerVersion(*container.DockerConfig.Version)) @@ -1121,7 +1295,7 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a } else if container.GetNetworkModeFromHostConfig() == "" || container.GetNetworkModeFromHostConfig() == apitask.BridgeNetworkMode { ipAddress, ok := getContainerHostIP(task.GetFirelensContainer().GetNetworkSettings()) if !ok { - err := apierrors.DockerClientConfigError{Msg: "unable to get BridgeIP for task in bridge mode"} + err := apierrors.DockerClientConfigError{Msg: "unable to get BridgeIP for task in bridge mode"} return dockerapi.DockerContainerMetadata{Error: apierrors.NamedError(&err)} } container.MergeEnvironmentVariables(map[string]string{ @@ -1145,7 +1319,10 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a // Populate credentialspec resource if container.RequiresCredentialSpec() { - seelog.Debugf("Obtained container %s with credentialspec resource requirement for task %s.", container.Name, task.Arn) + logger.Debug("Obtained container with credentialspec resource 
requirement for task", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) var credSpecResource *credentialspec.CredentialSpecResource resource, ok := task.GetCredentialSpecResource() if !ok || len(resource) <= 0 { @@ -1164,7 +1341,11 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a } // Inject containers' hostConfig.SecurityOpt with the credentialspec resource - seelog.Infof("Injecting container %s with credentialspec %s.", container.Name, desiredCredSpecInjection) + logger.Info("Injecting container with credentialspec", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "credentialSpec": desiredCredSpecInjection, + }) if len(hostConfig.SecurityOpt) == 0 { hostConfig.SecurityOpt = []string{desiredCredSpecInjection} } else { @@ -1184,21 +1365,24 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a if container.ShouldCreateWithEnvFiles() { err := task.MergeEnvVarsFromEnvfiles(container) if err != nil { - seelog.Errorf("Error populating environment variables from specified files into container %s", container.Name) + logger.Error("Error populating environment variables from specified files into container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: err, + }) return dockerapi.DockerContainerMetadata{Error: apierrors.NamedError(err)} } } if execcmd.IsExecEnabledContainer(container) { - tID, err := task.GetID() + tID := task.GetID() + err := engine.execCmdMgr.InitializeContainer(tID, container, hostConfig) if err != nil { - herr := &apierrors.HostConfigError{Msg: err.Error()} - return dockerapi.DockerContainerMetadata{Error: apierrors.NamedError(herr)} - } - err = engine.execCmdMgr.InitializeContainer(tID, container, hostConfig) - if err != nil { - seelog.Warnf("Exec Agent initialization: %v . 
Continuing to start container without enabling exec feature.", err) - + logger.Warn("Error initializing ExecCommandAgent; proceeding to start container without exec feature", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: err, + }) // Emit a managed agent state change event if exec agent initialization fails engine.tasksLock.RLock() mTask, ok := engine.managedTasks[task.Arn] @@ -1206,7 +1390,11 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a if ok { mTask.emitManagedAgentEvent(mTask.Task, container, execcmd.ExecuteCommandAgentName, fmt.Sprintf("ExecuteCommandAgent Initialization failed - %v", err)) } else { - seelog.Errorf("Task engine [%s]: Failed to update status of ExecCommandAgent Process for container [%s]: managed task not found", task.Arn, container.Name) + logger.Error("Failed to update status of ExecCommandAgent Process for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: "managed task not found", + }) } } } @@ -1240,8 +1428,11 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a DockerName: dockerContainerName, Container: container, }, task) - seelog.Infof("Task engine [%s]: created container name mapping for task: %s -> %s", - task.Arn, container.Name, dockerContainerName) + logger.Info("Created container name mapping for task", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "dockerContainerName": dockerContainerName, + }) } // Create metadata directory and file then populate it with common metadata of all containers of this task @@ -1249,13 +1440,19 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a if engine.cfg.ContainerMetadataEnabled.Enabled() && !container.IsInternal() { info, infoErr := engine.client.Info(engine.ctx, dockerclient.InfoTimeout) if infoErr != nil { - seelog.Warnf("Task engine [%s]: unable to get docker info : %v", - task.Arn, infoErr) + logger.Warn("Unable to get docker info", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: infoErr, + }) } mderr := engine.metadataManager.Create(config, hostConfig, task, container.Name, info.SecurityOptions) if mderr != nil { - seelog.Warnf("Task engine [%s]: unable to create metadata for container %s: %v", - task.Arn, container.Name, mderr) + logger.Warn("Unable to create metadata for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: mderr, + }) } } @@ -1263,8 +1460,6 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a metadata := client.CreateContainer(engine.ctx, config, hostConfig, dockerContainerName, engine.cfg.ContainerCreateTimeout) if metadata.DockerID != "" { - seelog.Infof("Task engine [%s]: created docker container for task: %s -> %s", - task.Arn, container.Name, metadata.DockerID) dockerContainer := &apicontainer.DockerContainer{DockerID: metadata.DockerID, DockerName: dockerContainerName, Container: container} @@ -1272,8 +1467,12 @@ func (engine *DockerTaskEngine) createContainer(task *apitask.Task, container *a engine.saveDockerContainerData(dockerContainer) } container.SetLabels(config.Labels) - seelog.Infof("Task engine [%s]: created docker container for task: %s -> %s, took %s", - task.Arn, container.Name, metadata.DockerID, time.Since(createContainerBegin)) + logger.Info("Created docker container for task", logger.Fields{ + field.TaskID: 
task.GetID(), + field.Container: container.Name, + field.DockerId: metadata.DockerID, + field.Elapsed: time.Since(createContainerBegin), + }) container.SetRuntimeID(metadata.DockerID) return metadata } @@ -1294,12 +1493,20 @@ func getFirelensLogConfig(task *apitask.Task, container *apicontainer.Container, if bufferLimitExists { logConfig.Config[logDriverBufferLimit] = bufferLimit } - seelog.Debugf("Applying firelens log config for container %s: %v", container.Name, logConfig) + logger.Debug("Applying firelens log config for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "config": logConfig, + }) return logConfig } func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *apicontainer.Container) dockerapi.DockerContainerMetadata { - seelog.Infof("Task engine [%s]: starting container: %s (Runtime ID: %s)", task.Arn, container.Name, container.GetRuntimeID()) + logger.Info("Starting container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + }) client := engine.client if container.DockerConfig.Version != nil { client = client.WithVersion(dockerclient.DockerVersion(*container.DockerConfig.Version)) @@ -1320,8 +1527,12 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap return dockerContainerMD } - seelog.Infof("Task engine [%s]: started docker container for task: %s -> %s, took %s", - task.Arn, container.Name, dockerContainerMD.DockerID, time.Since(startContainerBegin)) + logger.Info("Started container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + field.Elapsed: time.Since(startContainerBegin), + }) // Get metadata through container inspection and available task information then write this to the metadata file // Performs this in the background to avoid delaying container start @@ -1331,13 +1542,18 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap go func() { err := engine.metadataManager.Update(engine.ctx, dockerID, task, container.Name) if err != nil { - seelog.Warnf("Task engine [%s]: failed to update metadata file for container %s: %v", - task.Arn, container.Name, err) + logger.Warn("Failed to update metadata file for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: err, + }) return } container.SetMetadataFileUpdated() - seelog.Debugf("Task engine [%s]: updated metadata file for container %s", - task.Arn, container.Name) + logger.Debug("Updated metadata file for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) }() } @@ -1379,7 +1595,11 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap reason := "ExecuteCommandAgent started" if err := engine.execCmdMgr.StartAgent(engine.ctx, engine.client, task, container, dockerID); err != nil { reason = err.Error() - seelog.Errorf("Task engine [%s]: Failed to start ExecCommandAgent Process for container [%s]: %v", task.Arn, container.Name, err) + logger.Error("Failed to start ExecCommandAgent Process for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: err, + }) } engine.tasksLock.RLock() @@ -1390,7 +1610,11 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap if ok { mTask.emitManagedAgentEvent(mTask.Task, container, 
execcmd.ExecuteCommandAgentName, reason) } else { - seelog.Errorf("Task engine [%s]: Failed to update status of ExecCommandAgent Process for container [%s]: managed task not found", task.Arn, container.Name) + logger.Error("Failed to update status of ExecCommandAgent Process for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: "managed task not found", + }) } } } @@ -1412,8 +1636,10 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap } func (engine *DockerTaskEngine) provisionContainerResources(task *apitask.Task, container *apicontainer.Container) dockerapi.DockerContainerMetadata { - seelog.Infof("Task engine [%s]: setting up container resources for container [%s]", - task.Arn, container.Name) + logger.Info("Setting up container resources for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) containerInspectOutput, err := engine.inspectContainer(task, container) if err != nil { return dockerapi.DockerContainerMetadata{ @@ -1439,8 +1665,10 @@ func (engine *DockerTaskEngine) provisionContainerResources(task *apitask.Task, // Invoke the libcni to config the network namespace for the container result, err := engine.cniClient.SetupNS(engine.ctx, cniConfig, cniSetupTimeout) if err != nil { - seelog.Errorf("Task engine [%s]: unable to configure pause container namespace: %v", - task.Arn, err) + logger.Error("Unable to configure pause container namespace", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return dockerapi.DockerContainerMetadata{ DockerID: cniConfig.ContainerID, Error: ContainerNetworkingError{errors.Wrap(err, @@ -1450,7 +1678,10 @@ func (engine *DockerTaskEngine) provisionContainerResources(task *apitask.Task, // This is the IP of the task assigned on the bridge for IAM Task roles taskIP := result.IPs[0].Address.IP.String() - seelog.Infof("Task engine [%s]: associated with ip address '%s'", task.Arn, taskIP) + logger.Info("Task associated with ip address", logger.Fields{ + field.TaskID: task.GetID(), + "ip": taskIP, + }) engine.state.AddTaskIPAddress(taskIP, task.Arn) task.SetLocalIPAddress(taskIP) engine.saveTaskData(task) @@ -1458,8 +1689,10 @@ func (engine *DockerTaskEngine) provisionContainerResources(task *apitask.Task, // Invoke additional commands required to configure the task namespace routing. 
err = engine.namespaceHelper.ConfigureTaskNamespaceRouting(engine.ctx, task.GetPrimaryENI(), cniConfig, result) if err != nil { - seelog.Errorf("Task engine [%s]: unable to configure pause container namespace: %v", - task.Arn, err) + logger.Error("Unable to configure pause container namespace", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) return dockerapi.DockerContainerMetadata{ DockerID: cniConfig.ContainerID, Error: ContainerNetworkingError{errors.Wrapf(err, @@ -1485,7 +1718,10 @@ func (engine *DockerTaskEngine) checkTearDownPauseContainer(task *apitask.Task) if container.KnownTerminal() || container.DesiredTerminal() { err := engine.cleanupPauseContainerNetwork(task, container) if err != nil { - seelog.Errorf("Task engine [%s]: unable to cleanup pause container network namespace: %v", task.Arn, err) + logger.Error("Unable to cleanup pause container network namespace", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) } } return @@ -1501,7 +1737,10 @@ func (engine *DockerTaskEngine) cleanupPauseContainerNetwork(task *apitask.Task, } delay := time.Duration(engine.cfg.ENIPauseContainerCleanupDelaySeconds) * time.Second if engine.handleDelay != nil && delay > 0 { - seelog.Infof("Task engine [%s]: waiting %s before cleaning up pause container.", task.Arn, delay) + logger.Info("Waiting before cleaning up pause container", logger.Fields{ + field.TaskID: task.GetID(), + "wait": delay.String(), + }) engine.handleDelay(delay) } containerInspectOutput, err := engine.inspectContainer(task, container) @@ -1509,7 +1748,9 @@ func (engine *DockerTaskEngine) cleanupPauseContainerNetwork(task *apitask.Task, return errors.Wrap(err, "engine: cannot cleanup task network namespace due to error inspecting pause container") } - seelog.Infof("Task engine [%s]: cleaning up the network namespace", task.Arn) + logger.Info("Cleaning up the network namespace", logger.Fields{ + field.TaskID: task.GetID(), + }) cniConfig, err := engine.buildCNIConfigFromTaskContainer(task, containerInspectOutput, false) if err != nil { return errors.Wrapf(err, @@ -1522,7 +1763,9 @@ func (engine *DockerTaskEngine) cleanupPauseContainerNetwork(task *apitask.Task, } container.SetContainerTornDown(true) - seelog.Infof("Task engine [%s]: cleaned pause container network namespace", task.Arn) + logger.Info("Cleaned pause container network namespace", logger.Fields{ + field.TaskID: task.GetID(), + }) return nil } @@ -1577,8 +1820,10 @@ func (engine *DockerTaskEngine) inspectContainer(task *apitask.Task, container * } func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *apicontainer.Container) dockerapi.DockerContainerMetadata { - seelog.Infof("Task engine [%s]: stopping container [%s]", task.Arn, container.Name) - + logger.Info("Stopping container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) dockerID, err := engine.getDockerID(task, container) if err != nil { return dockerapi.DockerContainerMetadata{ @@ -1592,8 +1837,10 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api if container.Type == apicontainer.ContainerCNIPause { err := engine.cleanupPauseContainerNetwork(task, container) if err != nil { - seelog.Errorf("Task engine [%s]: unable to cleanup pause container network namespace: %v", - task.Arn, err) + logger.Error("Unable to cleanup pause container network namespace", logger.Fields{ + field.TaskID: task.GetID(), + field.Error: err, + }) } } @@ -1637,8 +1884,10 @@ func (engine *DockerTaskEngine) 
stopDockerContainer(dockerID, containerName stri } func (engine *DockerTaskEngine) removeContainer(task *apitask.Task, container *apicontainer.Container) error { - seelog.Infof("Task engine [%s]: removing container: %s", task.Arn, container.Name) - + logger.Info("Removing container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) dockerID, err := engine.getDockerID(task, container) if err != nil { return err @@ -1653,8 +1902,9 @@ func (engine *DockerTaskEngine) removeContainer(task *apitask.Task, container *a func (engine *DockerTaskEngine) updateTaskUnsafe(task *apitask.Task, update *apitask.Task) { managedTask, ok := engine.managedTasks[task.Arn] if !ok { - seelog.Criticalf("Task engine [%s]: ACS message for a task we thought we managed, but don't! Aborting.", - task.Arn) + logger.Critical("ACS message for a task we thought we managed, but don't! Aborting.", logger.Fields{ + field.TaskARN: task.Arn, + }) return } // Keep the lock because sequence numbers cannot be correct unless they are @@ -1662,14 +1912,20 @@ func (engine *DockerTaskEngine) updateTaskUnsafe(task *apitask.Task, update *api // This does block the engine's ability to ingest any new events (including // stops for past tasks, ack!), but this is necessary for correctness updateDesiredStatus := update.GetDesiredStatus() - seelog.Debugf("Task engine [%s]: putting update on the acs channel: [%s] with seqnum [%d]", - task.Arn, updateDesiredStatus.String(), update.StopSequenceNumber) + logger.Debug("Putting update on the acs channel", logger.Fields{ + field.TaskID: task.GetID(), + field.DesiredStatus: updateDesiredStatus.String(), + field.Sequence: update.StopSequenceNumber, + }) managedTask.emitACSTransition(acsTransition{ desiredStatus: updateDesiredStatus, seqnum: update.StopSequenceNumber, }) - seelog.Debugf("Task engine [%s]: update taken off the acs channel: [%s] with seqnum [%d]", - task.Arn, updateDesiredStatus.String(), update.StopSequenceNumber) + logger.Debug("Update taken off the acs channel", logger.Fields{ + field.TaskID: task.GetID(), + field.DesiredStatus: updateDesiredStatus.String(), + field.Sequence: update.StopSequenceNumber, + }) } // transitionContainer calls applyContainerState, and then notifies the managed @@ -1699,17 +1955,30 @@ func (engine *DockerTaskEngine) transitionContainer(task *apitask.Task, containe func (engine *DockerTaskEngine) applyContainerState(task *apitask.Task, container *apicontainer.Container, nextState apicontainerstatus.ContainerStatus) dockerapi.DockerContainerMetadata { transitionFunction, ok := engine.transitionFunctionMap()[nextState] if !ok { - seelog.Criticalf("Task engine [%s]: unsupported desired state transition for container [%s]: %s", - task.Arn, container.Name, nextState.String()) + logger.Critical("Unsupported desired state transition for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + "nextState": nextState.String(), + }) return dockerapi.DockerContainerMetadata{Error: &impossibleTransitionError{nextState}} } metadata := transitionFunction(task, container) if metadata.Error != nil { - seelog.Infof("Task engine [%s]: error transitioning container [%s (Runtime ID: %s)] to [%s]: %v", - task.Arn, container.Name, container.GetRuntimeID(), nextState.String(), metadata.Error) + logger.Error("Error transitioning container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + "nextState": nextState.String(), + 
field.Error: metadata.Error, + }) } else { - seelog.Debugf("Task engine [%s]: transitioned container [%s (Runtime ID: %s)] to [%s]", - task.Arn, container.Name, container.GetRuntimeID(), nextState.String()) + logger.Debug("Transitioned container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + "nextState": nextState.String(), + field.Error: metadata.Error, + }) } return metadata } @@ -1738,12 +2007,17 @@ func (engine *DockerTaskEngine) Version() (string, error) { func (engine *DockerTaskEngine) updateMetadataFile(task *apitask.Task, cont *apicontainer.DockerContainer) { err := engine.metadataManager.Update(engine.ctx, cont.DockerID, task, cont.Container.Name) if err != nil { - seelog.Errorf("Task engine [%s]: failed to update metadata file for container %s: %v", - task.Arn, cont.Container.Name, err) + logger.Error("Failed to update metadata file for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Container.Name, + field.Error: err, + }) } else { cont.Container.SetMetadataFileUpdated() - seelog.Debugf("Task engine [%s]: updated metadata file for container %s", - task.Arn, cont.Container.Name) + logger.Debug("Updated metadata file for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: cont.Container.Name, + }) } } diff --git a/agent/engine/docker_task_engine_linux_test.go b/agent/engine/docker_task_engine_linux_test.go index 120eae89c4e..fb1efc4aded 100644 --- a/agent/engine/docker_task_engine_linux_test.go +++ b/agent/engine/docker_task_engine_linux_test.go @@ -85,8 +85,7 @@ func TestResourceContainerProgression(t *testing.T) { mockControl := mock_control.NewMockControl(ctrl) mockIO := mock_ioutilwrapper.NewMockIOUtil(ctrl) - taskID, err := sleepTask.GetID() - assert.NoError(t, err) + taskID := sleepTask.GetID() cgroupMemoryPath := fmt.Sprintf("/sys/fs/cgroup/memory/ecs/%s/memory.use_hierarchy", taskID) cgroupRoot := fmt.Sprintf("/ecs/%s", taskID) cgroupResource := cgroup.NewCgroupResource(sleepTask.Arn, mockControl, mockIO, cgroupRoot, cgroupMountPath, specs.LinuxResources{}) @@ -94,6 +93,7 @@ func TestResourceContainerProgression(t *testing.T) { sleepTask.ResourcesMapUnsafe = make(map[string][]taskresource.TaskResource) sleepTask.AddResource("cgroup", cgroupResource) eventStream := make(chan dockerapi.DockerContainerChangeEvent) + // containerEventsWG is used to force the test to wait until the container created and started // events are processed containerEventsWG := sync.WaitGroup{} @@ -127,7 +127,6 @@ func TestResourceContainerProgression(t *testing.T) { }() }).Return(dockerapi.DockerContainerMetadata{DockerID: containerID + ":" + sleepContainer.Name}), ) - addTaskToEngine(t, ctx, taskEngine, sleepTask, mockTime, &containerEventsWG) cleanup := make(chan time.Time, 1) @@ -253,8 +252,7 @@ func TestResourceContainerProgressionFailure(t *testing.T) { sleepContainer.BuildResourceDependency("cgroup", resourcestatus.ResourceCreated, apicontainerstatus.ContainerPulled) mockControl := mock_control.NewMockControl(ctrl) - taskID, err := sleepTask.GetID() - assert.NoError(t, err) + taskID := sleepTask.GetID() cgroupRoot := fmt.Sprintf("/ecs/%s", taskID) cgroupResource := cgroup.NewCgroupResource(sleepTask.Arn, mockControl, nil, cgroupRoot, cgroupMountPath, specs.LinuxResources{}) @@ -269,7 +267,7 @@ func TestResourceContainerProgressionFailure(t *testing.T) { ) mockTime.EXPECT().Now().Return(time.Now()).AnyTimes() - err = taskEngine.Init(ctx) + err := 
taskEngine.Init(ctx) assert.NoError(t, err) taskEngine.AddTask(sleepTask) @@ -330,8 +328,7 @@ func TestTaskCPULimitHappyPath(t *testing.T) { }() mockControl := mock_control.NewMockControl(ctrl) mockIO := mock_ioutilwrapper.NewMockIOUtil(ctrl) - taskID, err := sleepTask.GetID() - assert.NoError(t, err) + taskID := sleepTask.GetID() cgroupMemoryPath := fmt.Sprintf("/sys/fs/cgroup/memory/ecs/%s/memory.use_hierarchy", taskID) if tc.taskCPULimit.Enabled() { // TODO Currently, the resource Setup() method gets invoked multiple diff --git a/agent/engine/docker_task_engine_windows.go b/agent/engine/docker_task_engine_windows.go index 1e64df89271..a5f80f9a91b 100644 --- a/agent/engine/docker_task_engine_windows.go +++ b/agent/engine/docker_task_engine_windows.go @@ -18,9 +18,11 @@ package engine import ( "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" apitask "github.com/aws/amazon-ecs-agent/agent/api/task" - "github.com/cihub/seelog" "github.com/pkg/errors" ) @@ -52,7 +54,11 @@ func (engine *DockerTaskEngine) invokePluginsForContainer(task *apitask.Task, co // Invoke the cni plugin for the container using libcni _, err = engine.cniClient.SetupNS(engine.ctx, cniConfig, cniSetupTimeout) if err != nil { - seelog.Errorf("Task engine [%s]: unable to configure container %v in the pause namespace: %v", task.Arn, container.Name, err) + logger.Error("Unable to configure container in the pause namespace", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.Error: err, + }) return errors.Wrap(err, "failed to connect HNS endpoint to container") } diff --git a/agent/engine/engine_sudo_linux_integ_test.go b/agent/engine/engine_sudo_linux_integ_test.go index 2eec610d792..e439b783cf5 100644 --- a/agent/engine/engine_sudo_linux_integ_test.go +++ b/agent/engine/engine_sudo_linux_integ_test.go @@ -247,7 +247,7 @@ func TestFirelensFluentbit(t *testing.T) { err = VerifyTaskStatus(apitaskstatus.TaskStopped, testTask.Arn, testEvents, t) assert.NoError(t, err) - taskID, err := testTask.GetID() + taskID := testTask.GetID() //declare a cloudwatch client cwlClient := cloudwatchlogs.New(session.New(), aws.NewConfig().WithRegion(testECSRegion)) diff --git a/agent/engine/execcmd/manager_start.go b/agent/engine/execcmd/manager_start.go index b1a46d18684..f9d84ff9718 100644 --- a/agent/engine/execcmd/manager_start.go +++ b/agent/engine/execcmd/manager_start.go @@ -23,7 +23,9 @@ import ( "strconv" "time" - "github.com/cihub/seelog" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/docker/docker/api/types" apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" @@ -42,7 +44,10 @@ import ( // To actually restart the ExecCommandAgent, this function invokes this instance's StartAgent method. 
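The comment above summarizes RestartAgentIfStopped: inspect the ExecCommandAgent process inside the container and, only when it is no longer running, fall back to StartAgent. A compact sketch of that restart-if-stopped flow, with hypothetical inspect and start callbacks standing in for the manager's docker-exec plumbing:

package main

import (
	"context"
	"errors"
	"fmt"
)

// processState is a simplified stand-in for the result of inspecting the agent process.
type processState struct {
	Running  bool
	ExitCode int
}

// restartIfStopped returns false without acting when the process is still
// running, and otherwise attempts a restart via the supplied start callback.
func restartIfStopped(ctx context.Context,
	inspect func(context.Context) (processState, error),
	start func(context.Context) error) (bool, error) {

	state, err := inspect(ctx)
	if err != nil {
		return false, fmt.Errorf("unable to inspect process: %w", err)
	}
	if state.Running {
		// Nothing to do; the agent is already running.
		return false, nil
	}
	if err := start(ctx); err != nil {
		return false, errors.New("restart failed: " + err.Error())
	}
	return true, nil
}

func main() {
	restarted, err := restartIfStopped(context.Background(),
		func(context.Context) (processState, error) { return processState{Running: false, ExitCode: 1}, nil },
		func(context.Context) error { return nil })
	fmt.Println(restarted, err)
}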
func (m *manager) RestartAgentIfStopped(ctx context.Context, client dockerapi.DockerClient, task *apitask.Task, container *apicontainer.Container, containerId string) (RestartStatus, error) { if !IsExecEnabledContainer(container) { - seelog.Warnf("Task engine [%s]: an attempt to restart ExecCommandAgent for a non ExecCommandAgent-enabled container was made; container %s", task.Arn, containerId) + logger.Warn("Attempt to restart ExecCommandAgent for a non ExecCommandAgent-enabled container was made", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + }) return NotRestarted, nil } ma, _ := container.GetManagedAgentByName(ExecuteCommandAgentName) @@ -60,7 +65,12 @@ func (m *manager) RestartAgentIfStopped(ctx context.Context, client dockerapi.Do } // Restart if not running //TODO: [ecs-exec] retry only for certain exit codes? - seelog.Warnf("Task engine [%s]: ExecCommandAgent Process stopped (exitCode=%d) for container %s, restarting...", task.Arn, res.ExitCode, containerId) + logger.Warn("ExecCommandAgent Process stopped for container, restarting...", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: containerId, + "exitCode": res.ExitCode, + }) container.UpdateManagedAgentByName(ExecuteCommandAgentName, apicontainer.ManagedAgentState{ ID: ma.ID, }) @@ -100,7 +110,11 @@ func (m *manager) inspectExecAgentProcess(ctx context.Context, client dockerapi. // If no error is returned, it can be assumed the ExecCommandAgent is started. func (m *manager) StartAgent(ctx context.Context, client dockerapi.DockerClient, task *apitask.Task, container *apicontainer.Container, containerId string) error { if !IsExecEnabledContainer(container) { - seelog.Warnf("Task engine [%s]: an attempt to start ExecCommandAgent for a non ExecCommandAgent-enabled container was made; container %s", task.Arn, containerId) + logger.Warn("An attempt to start ExecCommandAgent for a non ExecCommandAgent-enabled container was made", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: containerId, + }) return nil } ma, _ := container.GetManagedAgentByName(ExecuteCommandAgentName) @@ -112,9 +126,18 @@ func (m *manager) StartAgent(ctx context.Context, client dockerapi.DockerClient, if m.isAgentStarted(ma) { res, err := m.inspectExecAgentProcess(ctx, client, existingMetadata) if err != nil { - seelog.Warnf("Task engine [%s]: could not verify if the ExecCommandAgent was already running for container %s: %v", task.Arn, containerId, err) + logger.Warn("Could not verify if the ExecCommandAgent was already running for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: containerId, + field.Error: err, + }) } else if res.Running { // agent is already running, nothing to do - seelog.Warnf("Task engine [%s]: an attempt was made to start the ExecCommandAgent but it was already running (%s)", task.Arn, containerId) + logger.Warn("An attempt was made to start the ExecCommandAgent but it was already running", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: containerId, + }) return nil } } @@ -128,7 +151,12 @@ func (m *manager) StartAgent(ctx context.Context, client dockerapi.DockerClient, retry.RetryNWithBackoffCtx(ctx, backoff, maxRetries, func() error { execMD, startErr = m.doStartAgent(ctx, client, task, ma, containerId) if startErr != nil { - seelog.Warnf("Task engine [%s]: exec command agent failed to start 
for container %s: %v. Retrying...", task.Arn, containerId, startErr) + logger.Warn("Exec command agent failed to start for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: container.Name, + field.RuntimeID: containerId, + field.Error: startErr, + }) } return startErr }) @@ -163,20 +191,33 @@ func (m *manager) doStartAgent(ctx context.Context, client dockerapi.DockerClien return newMD, StartError{error: fmt.Errorf("unable to start ExecuteCommandAgent [create]: %v", err), retryable: true} } - seelog.Debugf("Task engine [%s]: created ExecCommandAgent for container: %s -> docker exec id: %s", task.Arn, containerId, execRes.ID) + logger.Debug("Created ExecCommandAgent for container", logger.Fields{ + field.TaskID: task.GetID(), + field.RuntimeID: containerId, + "execResId": execRes.ID, + }) err = client.StartContainerExec(ctx, execRes.ID, types.ExecStartCheck{Detach: true, Tty: false}, dockerclient.ContainerExecStartTimeout) if err != nil { return newMD, StartError{error: fmt.Errorf("unable to start ExecuteCommandAgent [pre-start]: %v", err), retryable: true} } - seelog.Debugf("Task engine [%s]: sent ExecCommandAgent start signal for container: %s -> docker exec id: %s", task.Arn, containerId, execRes.ID) + logger.Debug("Sent ExecCommandAgent start signal for container", logger.Fields{ + field.TaskID: task.GetID(), + field.RuntimeID: containerId, + "execResId": execRes.ID, + }) inspect, err := client.InspectContainerExec(ctx, execRes.ID, dockerclient.ContainerExecInspectTimeout) if err != nil { return newMD, StartError{error: fmt.Errorf("unable to start ExecuteCommandAgent [inspect]: %v", err), retryable: true} } - seelog.Debugf("Task engine [%s]: inspect ExecCommandAgent for container: %s -> pid: %d, exitCode: %d, running:%v, err:%v", - task.Arn, containerId, inspect.Pid, inspect.ExitCode, inspect.Running, err) + logger.Debug("Inspect ExecCommandAgent for container", logger.Fields{ + field.TaskID: task.GetID(), + field.RuntimeID: containerId, + "pid": inspect.Pid, + "exitCode": inspect.ExitCode, + "running": inspect.Running, + }) if !inspect.Running { //TODO: [ecs-exec] retry only for certain exit codes? 
return newMD, StartError{ @@ -184,7 +225,11 @@ func (m *manager) doStartAgent(ctx context.Context, client dockerapi.DockerClien retryable: true, } } - seelog.Infof("Task engine [%s]: started ExecCommandAgent for container: %s -> docker exec id: %s", task.Arn, containerId, execRes.ID) + logger.Info("Started ExecCommandAgent for container", logger.Fields{ + field.TaskID: task.GetID(), + field.RuntimeID: containerId, + "execResId": inspect.Pid, + }) newMD.PID = strconv.Itoa(inspect.Pid) newMD.DockerExecID = execRes.ID newMD.CMD = execAgentCmd diff --git a/agent/engine/image/types.go b/agent/engine/image/types.go index acdb92fc71b..d16438ab363 100644 --- a/agent/engine/image/types.go +++ b/agent/engine/image/types.go @@ -20,8 +20,10 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" - "github.com/cihub/seelog" ) type Image struct { @@ -55,7 +57,11 @@ type ImageState struct { func (imageState *ImageState) UpdateContainerReference(container *apicontainer.Container) { imageState.lock.Lock() defer imageState.lock.Unlock() - seelog.Infof("Updating container reference %v in Image State - %v", container.Name, imageState.Image.ImageID) + logger.Info("Updating container reference in image state", logger.Fields{ + field.Container: container.Name, + field.Image: container.Image, + "state": imageState.Image.ImageID, + }) imageState.Containers = append(imageState.Containers, container) } @@ -64,7 +70,10 @@ func (imageState *ImageState) AddImageName(imageName string) { imageState.lock.Lock() defer imageState.lock.Unlock() if !imageState.HasImageName(imageName) { - seelog.Infof("Adding image name- %v to Image state- %v", imageName, imageState.Image.ImageID) + logger.Info("Adding image to state", logger.Fields{ + field.Image: imageName, + "state": imageState.Image.ImageID, + }) imageState.Image.Names = append(imageState.Image.Names, imageName) } } @@ -125,7 +134,11 @@ func (imageState *ImageState) RemoveContainerReference(container *apicontainer.C for i := range imageState.Containers { if imageState.Containers[i].Name == container.Name { // Container reference found; hence remove it - seelog.Infof("Removing Container Reference: %v from Image State- %v", container.Name, imageState.Image.ImageID) + logger.Info("Removing container reference from image state", logger.Fields{ + field.Container: container.Name, + field.Image: container.Image, + "state": imageState.Image.ImageID, + }) imageState.Containers = append(imageState.Containers[:i], imageState.Containers[i+1:]...) // Update the last used time for the image imageState.LastUsedAt = time.Now() diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go index bba3b40aca9..a0ea9274b96 100644 --- a/agent/engine/task_manager.go +++ b/agent/engine/task_manager.go @@ -224,7 +224,7 @@ func (mtask *managedTask) overseeTask() { // If we aren't terminal and we aren't steady state, we should be // able to move some containers along. logger.Debug("Task not steady state or terminal; progressing it", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.progressTask() @@ -237,13 +237,13 @@ func (mtask *managedTask) overseeTask() { // We only break out of the above if this task is known to be stopped. 
Do // onetime cleanup here, including removing the task after a timeout logger.Info("Managed task has reached stopped; waiting for container cleanup", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.engine.checkTearDownPauseContainer(mtask.Task) mtask.cleanupCredentials() if mtask.StopSequenceNumber != 0 { logger.Debug("Marking done for this sequence", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Sequence: mtask.StopSequenceNumber, }) mtask.taskStopWG.Done(mtask.StopSequenceNumber) @@ -286,7 +286,7 @@ func (mtask *managedTask) waitForHostResources() { } logger.Info("Waiting for any previous stops to complete", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Sequence: mtask.StartSequenceNumber, }) @@ -307,7 +307,7 @@ func (mtask *managedTask) waitForHostResources() { } } logger.Info("Wait over; ready to move towards desired status", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.DesiredStatus: mtask.GetDesiredStatus().String(), }) } @@ -316,7 +316,7 @@ func (mtask *managedTask) waitForHostResources() { // event, or a timeout. func (mtask *managedTask) waitSteady() { logger.Info("Managed task at steady state", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.KnownStatus: mtask.GetKnownStatus().String(), }) @@ -329,7 +329,7 @@ func (mtask *managedTask) waitSteady() { if timedOut { logger.Debug("Checking to verify it's still at steady state", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) go mtask.engine.checkTaskState(mtask.Task) } @@ -341,7 +341,7 @@ func (mtask *managedTask) steadyState() bool { select { case <-mtask.ctx.Done(): logger.Info("Task manager exiting", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) return false default: @@ -365,12 +365,14 @@ func (mtask *managedTask) cleanupCredentials() { // return true. func (mtask *managedTask) waitEvent(stopWaiting <-chan struct{}) bool { logger.Debug("Waiting for task event", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) select { case acsTransition := <-mtask.acsMessages: logger.Info("Managed task got acs event", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), + field.DesiredStatus: acsTransition.desiredStatus.String(), + field.Sequence: acsTransition.seqnum, }) mtask.handleDesiredStatusChange(acsTransition.desiredStatus, acsTransition.seqnum) return false @@ -380,7 +382,7 @@ func (mtask *managedTask) waitEvent(stopWaiting <-chan struct{}) bool { case resChange := <-mtask.resourceStateChangeEvent: res := resChange.resource logger.Info("Managed task got resource", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: res.GetName(), field.Status: res.StatusString(resChange.nextState), }) @@ -399,14 +401,14 @@ func (mtask *managedTask) handleDesiredStatusChange(desiredStatus apitaskstatus. 
// Handle acs message changes this task's desired status to whatever // acs says it should be if it is compatible logger.Info("New acs transition", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.DesiredStatus: desiredStatus.String(), field.Sequence: seqnum, "StopNumber": mtask.StopSequenceNumber, }) if desiredStatus <= mtask.GetDesiredStatus() { - logger.Info("Redundant task transition; ignoring", logger.Fields{ - field.TaskARN: mtask.Arn, + logger.Debug("Redundant task transition; ignoring", logger.Fields{ + field.TaskID: mtask.GetID(), field.DesiredStatus: desiredStatus.String(), field.Sequence: seqnum, "StopNumber": mtask.StopSequenceNumber, @@ -416,7 +418,7 @@ func (mtask *managedTask) handleDesiredStatusChange(desiredStatus apitaskstatus. if desiredStatus == apitaskstatus.TaskStopped && seqnum != 0 && mtask.GetStopSequenceNumber() == 0 { logger.Info("Managed task moving to stopped, adding to stopgroup with sequence number", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Sequence: seqnum, }) mtask.SetStopSequenceNumber(seqnum) @@ -436,22 +438,23 @@ func (mtask *managedTask) handleContainerChange(containerChange dockerContainerC runtimeID := container.GetRuntimeID() event := containerChange.event containerKnownStatus := container.GetKnownStatus() - if event.Status != containerKnownStatus { - logger.Info("Handling container change event", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: runtimeID, - field.Status: event.Status.String(), - }) + + eventLogFields := logger.Fields{ + field.TaskID: mtask.GetID(), + field.Container: container.Name, + field.RuntimeID: runtimeID, + field.Status: event.Status.String(), + } + + if event.Status.String() == apicontainerstatus.ContainerStatusNone.String() || + (event.Status == apicontainerstatus.ContainerRunning && containerKnownStatus == apicontainerstatus.ContainerResourcesProvisioned) { + logger.Debug("Handling container change event", eventLogFields) + } else if event.Status != containerKnownStatus { // This prevents noisy docker event logs for pause container + logger.Info("Handling container change event", eventLogFields) } found := mtask.isContainerFound(container) if !found { - logger.Critical("State error; invoked with another task's container!", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: runtimeID, - field.Status: event.Status.String(), - }) + logger.Critical("State error; invoked with another task's container!", eventLogFields) return } @@ -459,11 +462,7 @@ func (mtask *managedTask) handleContainerChange(containerChange dockerContainerC // to be known running so it will be stopped. 
Subsequently ignore these backward transitions mtask.handleStoppedToRunningContainerTransition(event.Status, container) if event.Status <= containerKnownStatus { - logger.Debug("Container change is redundant", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: runtimeID, - field.Status: event.Status.String(), + logger.Debug("Container change is redundant", eventLogFields, logger.Fields{ field.KnownStatus: containerKnownStatus.String(), }) @@ -495,20 +494,13 @@ func (mtask *managedTask) handleContainerChange(containerChange dockerContainerC } mtask.RecordExecutionStoppedAt(container) - logger.Debug("Sending container change event to tcs", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: runtimeID, - field.Status: event.Status.String(), - }) + logger.Debug("Sending container change event to tcs", eventLogFields) err := mtask.containerChangeEventStream.WriteToEventStream(event) if err != nil { logger.Warn("Failed to write container change event to tcs event stream", + eventLogFields, logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: runtimeID, - field.Error: err, + field.Error: err, }) } @@ -533,7 +525,7 @@ func (mtask *managedTask) handleResourceStateChange(resChange resourceStateChang if !mtask.isResourceFound(res) { logger.Critical("State error; invoked with another task's resource", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: res.GetName(), }) return @@ -545,7 +537,7 @@ func (mtask *managedTask) handleResourceStateChange(resChange resourceStateChang if status <= currentKnownStatus { logger.Info("Redundant resource state change", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: res.GetName(), field.Status: res.StatusString(status), field.KnownStatus: res.StatusString(currentKnownStatus), @@ -566,15 +558,15 @@ func (mtask *managedTask) handleResourceStateChange(resChange resourceStateChang } if status == res.SteadyState() { // Failed to create resource. - logger.Error("Managed task [%s]: failed to create task resource [%s]: %v", logger.Fields{ - field.TaskARN: mtask.Arn, + logger.Error("Failed to create task resource", logger.Fields{ + field.TaskID: mtask.GetID(), field.Resource: res.GetName(), field.Error: err, }) res.SetKnownStatus(currentKnownStatus) // Set status back to None. 
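// Resource creation failed: with the resource's known status rolled back, the task is driven to STOPPED below and the resource's terminal reason is recorded so it surfaces in the task state change.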
logger.Info("Marking task desired status to STOPPED", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.SetDesiredStatus(apitaskstatus.TaskStopped) mtask.Task.SetTerminalReason(res.GetTerminalReason()) @@ -585,7 +577,7 @@ func (mtask *managedTask) emitResourceChange(change resourceStateChange) { select { case <-mtask.ctx.Done(): logger.Info("Unable to emit resource state change due to exit", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) case mtask.resourceStateChangeEvent <- change: } @@ -611,17 +603,25 @@ func getContainerEventLogFields(c api.ContainerStateChange) logger.Fields { } func (mtask *managedTask) emitTaskEvent(task *apitask.Task, reason string) { + taskKnownStatus := task.GetKnownStatus() + if !taskKnownStatus.BackendRecognized() { + logger.Debug("Skipping event emission for task", logger.Fields{ + field.TaskID: mtask.GetID(), + field.Error: "status not recognized by ECS", + }) + return + } event, err := api.NewTaskStateChangeEvent(task, reason) if err != nil { - logger.Debug("Managed task [%s]: skipping emitting event for task [%s]: %v", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Reason: reason, - field.Error: err, + logger.Error("Skipping emitting event for task due to error", logger.Fields{ + field.TaskID: mtask.GetID(), + field.Reason: reason, + field.Error: err, }) return } logger.Debug("Sending task change event", logger.Fields{ - field.TaskARN: task.Arn, + field.TaskID: mtask.GetID(), field.Status: event.Status.String(), field.SentStatus: task.GetSentStatus().String(), field.Event: event.String(), @@ -629,7 +629,7 @@ func (mtask *managedTask) emitTaskEvent(task *apitask.Task, reason string) { select { case <-mtask.ctx.Done(): logger.Info("Unable to send task change event due to exit", logger.Fields{ - field.TaskARN: task.Arn, + field.TaskID: mtask.GetID(), field.Status: event.Status.String(), field.SentStatus: task.GetSentStatus().String(), field.Event: event.String(), @@ -637,8 +637,8 @@ func (mtask *managedTask) emitTaskEvent(task *apitask.Task, reason string) { case mtask.stateChangeEvents <- event: } logger.Debug("Sent task change event", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Event: event.String(), + field.TaskID: mtask.GetID(), + field.Event: event.String(), }) } @@ -649,27 +649,27 @@ func (mtask *managedTask) emitManagedAgentEvent(task *apitask.Task, cont *apicon event, err := api.NewManagedAgentChangeEvent(task, cont, managedAgentName, reason) if err != nil { logger.Error("Skipping emitting ManagedAgent event for task", logger.Fields{ - field.TaskARN: task.Arn, - field.Reason: reason, - field.Error: err, + field.TaskID: mtask.GetID(), + field.Reason: reason, + field.Error: err, }) return } logger.Info("Sending ManagedAgent event", logger.Fields{ - field.TaskARN: task.Arn, - field.Event: event.String(), + field.TaskID: mtask.GetID(), + field.Event: event.String(), }) select { case <-mtask.ctx.Done(): logger.Info("Unable to send managed agent event due to exit", logger.Fields{ - field.TaskARN: task.Arn, - field.Event: event.String(), + field.TaskID: mtask.GetID(), + field.Event: event.String(), }) case mtask.stateChangeEvents <- event: } logger.Info("Sent managed agent event [%s]", logger.Fields{ - field.TaskARN: task.Arn, - field.Event: event.String(), + field.TaskID: mtask.GetID(), + field.Event: event.String(), }) } @@ -679,7 +679,7 @@ func (mtask *managedTask) emitContainerEvent(task *apitask.Task, cont *apicontai event, err := api.NewContainerStateChangeEvent(task, cont, 
reason) if err != nil { logger.Debug("Skipping emitting event for container", logger.Fields{ - field.TaskARN: task.Arn, + field.TaskID: mtask.GetID(), field.Container: cont.Name, field.Error: err, }) @@ -690,17 +690,17 @@ func (mtask *managedTask) emitContainerEvent(task *apitask.Task, cont *apicontai func (mtask *managedTask) doEmitContainerEvent(event api.ContainerStateChange) { logger.Debug("Sending container change event", getContainerEventLogFields(event), logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) select { case <-mtask.ctx.Done(): logger.Info("Unable to send container change event due to exit", getContainerEventLogFields(event), logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) case mtask.stateChangeEvents <- event: } logger.Debug("Sent container change event", getContainerEventLogFields(event), logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) } @@ -708,7 +708,7 @@ func (mtask *managedTask) emitDockerContainerChange(change dockerContainerChange select { case <-mtask.ctx.Done(): logger.Info("Unable to emit docker container change due to exit", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) case mtask.dockerMessages <- change: } @@ -718,7 +718,7 @@ func (mtask *managedTask) emitACSTransition(transition acsTransition) { select { case <-mtask.ctx.Done(): logger.Info("Unable to emit docker container change due to exit", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) case mtask.acsMessages <- transition: } @@ -750,7 +750,7 @@ func (mtask *managedTask) releaseIPInIPAM() { return } logger.Info("IPAM releasing ip for task eni", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) cfg, err := mtask.BuildCNIConfig(true, &ecscni.Config{ @@ -758,16 +758,16 @@ func (mtask *managedTask) releaseIPInIPAM() { }) if err != nil { logger.Error("Failed to release ip; unable to build cni configuration", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Error: err, + field.TaskID: mtask.GetID(), + field.Error: err, }) return } err = mtask.cniClient.ReleaseIPResource(mtask.ctx, cfg, ipamCleanupTmeout) if err != nil { logger.Error("Failed to release ip; IPAM error", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Error: err, + field.TaskID: mtask.GetID(), + field.Error: err, }) return } @@ -797,7 +797,7 @@ func (mtask *managedTask) handleStoppedToRunningContainerTransition(status apico // update it to 'known running' so that it will be driven back to stopped mtask.unexpectedStart.Do(func() { logger.Warn("Stopped container came back; re-stopping it once", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, }) go mtask.engine.transitionContainer(mtask.Task, container, apicontainerstatus.ContainerStopped) @@ -813,7 +813,7 @@ func (mtask *managedTask) handleManagedAgentStoppedTransition(container *apicont case execcmd.ExecuteCommandAgentName: if !container.UpdateManagedAgentStatus(managedAgentName, apicontainerstatus.ManagedAgentStopped) { logger.Warn("Cannot find ManagedAgent for container", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.ManagedAgent: managedAgentName, }) @@ -822,7 +822,7 @@ func (mtask *managedTask) handleManagedAgentStoppedTransition(container *apicont mtask.emitManagedAgentEvent(mtask.Task, container, managedAgentName, "Received Container Stopped event") default: logger.Warn("Unexpected ManagedAgent 
in container; unable to process ManagedAgent transition event", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.ManagedAgent: managedAgentName, }) @@ -848,7 +848,7 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange if mtask.cfg.ImagePullBehavior == config.ImagePullAlwaysBehavior || mtask.cfg.ImagePullBehavior == config.ImagePullOnceBehavior { logger.Error("Error while pulling image; moving task to STOPPED", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Image: container.Image, field.Container: container.Name, field.Error: event.Error, @@ -864,7 +864,7 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange // If the agent pull behavior is default, use local image cache directly, // assuming it exists. logger.Error("Error while pulling image; will try to run anyway", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Image: container.Image, field.Container: container.Name, field.Error: event.Error, @@ -879,7 +879,7 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange case apicontainerstatus.ContainerCreated: // No need to explicitly stop containers if this is a * -> NONE/CREATED transition logger.Warn("Error creating container; marking its desired status as STOPPED", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.Error: event.Error, }) @@ -890,7 +890,7 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange // If this is a * -> RUNNING / RESOURCES_PROVISIONED transition, we need to stop // the container. logger.Warn("Error starting/provisioning container[%s (Runtime ID: %s)];", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), field.Error: event.Error, @@ -913,7 +913,7 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange if shouldForceStop { logger.Warn("Forcing container to stop", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), }) @@ -934,7 +934,7 @@ func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi. pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout) if pr.Error != nil { logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), "ErrorName": event.Error.ErrorName(), @@ -952,7 +952,7 @@ func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi. // This can happen in cases where the container we tried to stop // was already stopped or did not exist at all. logger.Warn("Error stopping the container; marking it as stopped anyway", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), "ErrorName": event.Error.ErrorName(), @@ -971,7 +971,7 @@ func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi. // Container changes may also prompt the task status to change as well. 
func (mtask *managedTask) progressTask() { logger.Debug("Progressing containers and resources in task", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) // max number of transitions length to ensure writes will never block on // these and if we exit early transitions can exit the goroutine and it'll @@ -1042,7 +1042,7 @@ func (mtask *managedTask) progressTask() { // update the task status if mtask.UpdateStatus() { logger.Info("Container or resource change also resulted in task change", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) // If knownStatus changed, let it be known @@ -1060,7 +1060,7 @@ func (mtask *managedTask) isWaitingForACSExecutionCredentials(reasons []error) b for _, reason := range reasons { if reason == dependencygraph.CredentialsNotResolvedErr { logger.Info("Waiting for credentials to pull from ECR", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) timeoutCtx, timeoutCancel := context.WithTimeout(mtask.ctx, waitForPullCredentialsTimeout) @@ -1069,7 +1069,7 @@ func (mtask *managedTask) isWaitingForACSExecutionCredentials(reasons []error) b timedOut := mtask.waitEvent(timeoutCtx.Done()) if timedOut { logger.Info("Timed out waiting for acs credentials message", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) } return true @@ -1139,7 +1139,7 @@ func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransi if knownStatus >= desiredStatus { logger.Debug("Resource has already transitioned to or beyond the desired status", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: res.GetName(), field.KnownStatus: res.StatusString(knownStatus), field.DesiredStatus: res.StatusString(desiredStatus), @@ -1194,7 +1194,7 @@ func (mtask *managedTask) applyResourceState(resource taskresource.TaskResource, err := resource.ApplyTransition(nextState) if err != nil { logger.Info("Error transitioning resource", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: resName, field.FailedStatus: resStatus, field.Error: err, @@ -1203,9 +1203,9 @@ func (mtask *managedTask) applyResourceState(resource taskresource.TaskResource, return err } logger.Info("Transitioned resource", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Resource: resName, - field.FailedStatus: resStatus, + field.TaskID: mtask.GetID(), + field.Resource: resName, + field.Status: resStatus, }) return nil } @@ -1233,7 +1233,7 @@ func (mtask *managedTask) containerNextState(container *apicontainer.Container) if containerKnownStatus == containerDesiredStatus { logger.Debug("Container at desired status", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), field.DesiredStatus: containerDesiredStatus.String(), @@ -1246,8 +1246,8 @@ func (mtask *managedTask) containerNextState(container *apicontainer.Container) } if containerKnownStatus > containerDesiredStatus { - logger.Debug("Managed task [%s]: container [%s (Runtime ID: %s)] has already transitioned beyond desired status(%s): %s", logger.Fields{ - field.TaskARN: mtask.Arn, + logger.Debug("Container has already transitioned beyond desired status", logger.Fields{ + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), field.KnownStatus: containerKnownStatus.String(), @@ -1262,7 +1262,7 @@ func (mtask *managedTask) 
containerNextState(container *apicontainer.Container) if blocked, err := dependencygraph.DependenciesAreResolved(container, mtask.Containers, mtask.Task.GetExecutionCredentialsID(), mtask.credentialsManager, mtask.GetResources(), mtask.cfg); err != nil { logger.Debug("Can't apply state to container yet due to unresolved dependencies", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Container: container.Name, field.RuntimeID: container.GetRuntimeID(), field.Error: err, @@ -1324,8 +1324,8 @@ func (mtask *managedTask) resourceNextState(resource taskresource.TaskResource) resDesiredStatus := resource.GetDesiredStatus() if resKnownStatus >= resDesiredStatus { - logger.Debug("Managed task [%s]: task resource [%s] has already transitioned to or beyond desired status(%s): %s", logger.Fields{ - field.TaskARN: mtask.Arn, + logger.Debug("Task resource has already transitioned to or beyond desired status", logger.Fields{ + field.TaskID: mtask.GetID(), field.Resource: resource.GetName(), field.KnownStatus: resource.StatusString(resKnownStatus), field.DesiredStatus: resource.StatusString(resDesiredStatus), @@ -1340,7 +1340,7 @@ func (mtask *managedTask) resourceNextState(resource taskresource.TaskResource) if err := dependencygraph.TaskResourceDependenciesAreResolved(resource, mtask.Containers); err != nil { logger.Debug("Can't apply state to resource yet due to unresolved dependencies", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.Resource: resource.GetName(), field.Error: err, }) @@ -1371,21 +1371,21 @@ func (mtask *managedTask) resourceNextState(resource taskresource.TaskResource) func (mtask *managedTask) handleContainersUnableToTransitionState() { logger.Critical("Task in a bad state; it's not steady state but no containers want to transition", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) if mtask.GetDesiredStatus().Terminal() { // Ack, really bad. We want it to stop but the containers don't think // that's possible. let's just break out and hope for the best! logger.Critical("The state is so bad that we're just giving up on it", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.SetKnownStatus(apitaskstatus.TaskStopped) mtask.emitTaskEvent(mtask.Task, taskUnableToTransitionToStoppedReason) // TODO we should probably panic here } else { logger.Critical("Moving task to stopped due to bad state", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.handleDesiredStatusChange(apitaskstatus.TaskStopped, 0) } @@ -1399,18 +1399,18 @@ func (mtask *managedTask) waitForTransition(transitions map[string]string, // progressTask call. This is done by waiting for one transition/acs/docker message. 
if !mtask.waitEvent(transition) { logger.Debug("Received non-transition events", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) return } transitionedEntity := <-transitionChangeEntity - logger.Debug("Managed task [%s]: transition for [%s] finished", logger.Fields{ - field.TaskARN: mtask.Arn, + logger.Debug("Transition finished", logger.Fields{ + field.TaskID: mtask.GetID(), "TransitionedEntity": transitionedEntity, }) delete(transitions, transitionedEntity) logger.Debug("Task still waiting for: %v", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), "TransitionedEntity": fmt.Sprintf("%v", transitions), }) } @@ -1430,7 +1430,7 @@ func (mtask *managedTask) cleanupTask(taskStoppedDuration time.Duration) { cleanupTime := make(<-chan time.Time) if cleanupTimeDuration < 0 { logger.Info("Cleanup Duration has been exceeded; starting cleanup now", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) cleanupTime = mtask.time().After(time.Nanosecond) } else { @@ -1449,14 +1449,14 @@ func (mtask *managedTask) cleanupTask(taskStoppedDuration time.Duration) { ok := mtask.waitForStopReported() if !ok { logger.Error("Aborting cleanup for task as it is not reported as stopped", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.SentStatus: mtask.GetSentStatus().String(), }) return } logger.Info("Cleaning up task's containers and data", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) // For the duration of this, simply discard any task events; this ensures the @@ -1469,7 +1469,7 @@ func (mtask *managedTask) cleanupTask(taskStoppedDuration time.Duration) { // Remove TaskExecutionCredentials from credentialsManager if taskExecutionCredentialsID != "" { logger.Info("Cleaning up task's execution credentials", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), }) mtask.credentialsManager.RemoveCredentials(taskExecutionCredentialsID) } @@ -1505,7 +1505,7 @@ func (mtask *managedTask) waitForStopReported() bool { break } logger.Warn("Blocking cleanup until the task has been reported stopped", logger.Fields{ - field.TaskARN: mtask.Arn, + field.TaskID: mtask.GetID(), field.SentStatus: sentStatus.String(), "Attempt": i + 1, "MaxAttempts": _maxStoppedWaitTimes, diff --git a/agent/engine/task_manager_unix_test.go b/agent/engine/task_manager_unix_test.go index 476656553de..abb0fb62f16 100644 --- a/agent/engine/task_manager_unix_test.go +++ b/agent/engine/task_manager_unix_test.go @@ -428,7 +428,7 @@ func TestCleanupExecEnabledTask(t *testing.T) { dockerContainer := &apicontainer.DockerContainer{ DockerName: "dockerContainer", } - tID, _ := mTask.Task.GetID() + tID := mTask.Task.GetID() removeAll = func(path string) error { assert.Equal(t, fmt.Sprintf("/log/exec/%s", tID), path) return nil diff --git a/agent/eni/watcher/watcher_linux.go b/agent/eni/watcher/watcher_linux.go index 795bf91c5fc..b5a1124f926 100644 --- a/agent/eni/watcher/watcher_linux.go +++ b/agent/eni/watcher/watcher_linux.go @@ -20,6 +20,9 @@ import ( "strings" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + log "github.com/cihub/seelog" "github.com/deniswernert/udev" "github.com/pkg/errors" @@ -109,7 +112,9 @@ func (eniWatcher *ENIWatcher) reconcileOnce(withRetry bool) error { if withRetry { go func(ctx context.Context, macAddress string, timeout time.Duration) { if err := eniWatcher.sendENIStateChangeWithRetries(ctx, macAddress, 
timeout); err != nil { - log.Infof("ENI watcher event-handler: unable to send state change: %v", err) + logger.Error("Unable to send ENI state change", logger.Fields{ + field.Error: err, + }) } }(eniWatcher.ctx, mac, sendENIStateChangeRetryTimeout) continue @@ -119,9 +124,13 @@ func (eniWatcher *ENIWatcher) reconcileOnce(withRetry bool) error { if strings.Contains(err.Error(), eniStatusSentMsg) { continue } else if _, ok := err.(*unmanagedENIError); ok { - log.Debugf("ENI watcher reconciliation: unable to send state change: %v", err) + logger.Debug("Unable to send ENI state change", logger.Fields{ + field.Error: err, + }) } else { - log.Warnf("ENI watcher reconciliation: unable to send state change: %v", err) + logger.Warn("Unable to send ENI state change", logger.Fields{ + field.Error: err, + }) } } } diff --git a/agent/eventhandler/task_handler_types.go b/agent/eventhandler/task_handler_types.go index f7559c2c032..4df3eedfb3b 100644 --- a/agent/eventhandler/task_handler_types.go +++ b/agent/eventhandler/task_handler_types.go @@ -18,6 +18,9 @@ import ( "fmt" "sync" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/api" apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" @@ -149,18 +152,27 @@ func (event *sendableEvent) send( backoff retry.Backoff, taskEvents *taskSendableEvents) error { - seelog.Infof("TaskHandler: Sending %s change: %s", eventType, event.toString()) + logger.Info("Sending state change to ECS", logger.Fields{ + "eventType": eventType, + "eventData": event.toString(), + }) // Try submitting the change to ECS if err := sendStatusToECS(client, event); err != nil { - seelog.Errorf("TaskHandler: Unretriable error submitting %s state change [%s]: %v", - eventType, event.toString(), err) + logger.Error("Unretriable error sending state change to ECS", logger.Fields{ + "eventType": eventType, + "eventData": event.toString(), + field.Error: err, + }) return err } // submitted; ensure we don't retry it event.setSent() // Mark event as sent setChangeSent(event, dataClient) - seelog.Debugf("TaskHandler: Submitted task state change: %s", event.toString()) + logger.Debug("Submitted state change to ECS", logger.Fields{ + "eventType": eventType, + "eventData": event.toString(), + }) taskEvents.events.Remove(eventToSubmit) backoff.Reset() return nil diff --git a/agent/logger/field/constants.go b/agent/logger/field/constants.go index a056689c8da..18b79d38a99 100644 --- a/agent/logger/field/constants.go +++ b/agent/logger/field/constants.go @@ -14,19 +14,25 @@ package field const ( + TaskID = "task" TaskARN = "taskARN" Container = "container" + DockerId = "dockerId" ManagedAgent = "managedAgent" KnownStatus = "knownStatus" + KnownSent = "knownSent" DesiredStatus = "desiredStatus" SentStatus = "sentStatus" FailedStatus = "failedStatus" - Sequence = "sequence" + Sequence = "seqnum" Reason = "reason" Status = "status" RuntimeID = "runtimeID" + Elapsed = "elapsed" Resource = "resource" Error = "error" Event = "event" Image = "image" + Volume = "volume" + Time = "time" ) diff --git a/agent/stats/engine.go b/agent/stats/engine.go index ef19761c482..7b5ce54dd9e 100644 --- a/agent/stats/engine.go +++ b/agent/stats/engine.go @@ -22,6 +22,9 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/cihub/seelog"
"github.com/pborman/uuid" "github.com/pkg/errors" @@ -165,7 +168,10 @@ func (engine *DockerStatsEngine) addAndStartStatsContainer(containerID string) { defer engine.lock.Unlock() statsContainer, statsTaskContainer, err := engine.addContainerUnsafe(containerID) if err != nil { - seelog.Debugf("Adding container to stats watch list failed, container: %s, err: %v", containerID, err) + logger.Debug("Adding container to stats watchlist failed", logger.Fields{ + field.Container: containerID, + field.Error: err, + }) return } @@ -182,7 +188,10 @@ func (engine *DockerStatsEngine) addAndStartStatsContainer(containerID string) { dockerContainer, errResolveContainer := engine.resolver.ResolveContainer(containerID) if errResolveContainer != nil { - seelog.Debugf("Could not map container ID to container, container: %s, err: %s", containerID, err) + logger.Debug("Could not map container ID to container", logger.Fields{ + field.Container: containerID, + field.Error: err, + }) return } @@ -191,7 +200,9 @@ func (engine *DockerStatsEngine) addAndStartStatsContainer(containerID string) { if statsTaskContainer != nil && dockerContainer.Container.Type == apicontainer.ContainerCNIPause { statsTaskContainer.StartStatsCollection() } else { - seelog.Debugf("stats task container is nil, cannot start task stats collection") + logger.Debug("Stats task container is nil, cannot start task stats collection", logger.Fields{ + field.Container: containerID, + }) } } @@ -204,7 +215,7 @@ func (engine *DockerStatsEngine) MustInit(ctx context.Context, taskEngine ecseng engine.ctx = derivedCtx // TODO ensure that this is done only once per engine object - seelog.Info("Initializing stats engine") + logger.Info("Initializing stats engine") engine.cluster = cluster engine.containerInstanceArn = containerInstanceArn @@ -221,7 +232,9 @@ func (engine *DockerStatsEngine) MustInit(ctx context.Context, taskEngine ecseng } err = engine.synchronizeState() if err != nil { - seelog.Warnf("Synchronize the container state failed, err: %v", err) + logger.Warn("Synchronize the container state failed", logger.Fields{ + field.Error: err, + }) } go engine.waitToStop() @@ -243,7 +256,7 @@ func (engine *DockerStatsEngine) Disable() { func (engine *DockerStatsEngine) waitToStop() { // Waiting for the event stream to close <-engine.containerChangeEventStream.Context().Done() - seelog.Debug("Event stream closed, stop listening to the event stream") + logger.Debug("Event stream closed, stop listening to the event stream") engine.containerChangeEventStream.Unsubscribe(containerChangeHandler) engine.removeAll() } @@ -280,7 +293,7 @@ func (engine *DockerStatsEngine) addToStatsTaskMapUnsafe(task *apitask.Task, doc return } containerpid := strconv.Itoa(containerInspect.State.Pid) - statsTaskContainer, err = newStatsTaskContainer(task.Arn, containerpid, numberOfContainers, + statsTaskContainer, err = newStatsTaskContainer(task.Arn, task.GetID(), containerpid, numberOfContainers, engine.resolver, engine.config.PollingMetricsWaitDuration, task.ENIs) if err != nil { return @@ -494,13 +507,18 @@ func (engine *DockerStatsEngine) stopTrackingContainerUnsafe(container *StatsCon // docker task engine. If the docker task engine has already removed // the container from its state, there's no point in stats engine tracking the // container. So, clean-up anyway. 
- seelog.Warnf("Error determining if the container %s is terminal, removing from stats, err: %v", container.containerMetadata.DockerID, err) + logger.Warn("Error determining if the container is terminal, removing from stats", logger.Fields{ + field.Container: container.containerMetadata.DockerID, + field.Error: err, + }) engine.doRemoveContainerUnsafe(container, taskARN) return true } if terminal { // Container is in known terminal state. Stop collection metrics. - seelog.Infof("Container %s is terminal, removing from stats", container.containerMetadata.DockerID) + logger.Info("Container is terminal, removing from stats", logger.Fields{ + field.Container: container.containerMetadata.DockerID, + }) engine.doRemoveContainerUnsafe(container, taskARN) return true } @@ -657,12 +675,18 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* // CPU and Memory are both critical, so skip the container if either of these fail. cpuStatsSet, err := container.statsQueue.GetCPUStatsSet() if err != nil { - seelog.Infof("cloudwatch metrics for container %v not collected, reason (cpu): %v", dockerID, err) + logger.Error("Error collecting cloudwatch metrics for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) continue } memoryStatsSet, err := container.statsQueue.GetMemoryStatsSet() if err != nil { - seelog.Infof("cloudwatch metrics for container %v not collected, reason (memory): %v", dockerID, err) + logger.Error("Error collecting cloudwatch metrics for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) continue } @@ -674,14 +698,20 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* storageStatsSet, err := container.statsQueue.GetStorageStatsSet() if err != nil { - seelog.Warnf("Error getting storage stats, err: %v, container: %v", err, dockerID) + logger.Warn("Error getting storage stats for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) } else { containerMetric.StorageStatsSet = storageStatsSet } task, err := engine.resolver.ResolveTask(dockerID) if err != nil { - seelog.Warnf("Task not found for container ID: %s", dockerID) + logger.Warn("Task not found for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) } else { // send network stats for default/bridge/nat/awsvpc network modes if !task.IsNetworkModeAWSVPC() && container.containerMetadata.NetworkMode != hostNetworkMode && @@ -689,7 +719,10 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* networkStatsSet, err := container.statsQueue.GetNetworkStatsSet() if err != nil { // we log the error and still continue to publish cpu, memory stats - seelog.Warnf("Error getting network stats: %v, container: %v", err, dockerID) + logger.Warn("Error getting network stats for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) } else { containerMetric.NetworkStatsSet = networkStatsSet } @@ -705,7 +738,11 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* if dockerContainer.Container.Type != apicontainer.ContainerCNIPause { networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet() if err != nil { - seelog.Warnf("error getting network stats: %v, task: %v", err, taskArn) + logger.Warn("Error getting network stats for container", logger.Fields{ + field.TaskARN: taskArn, + field.Container: dockerContainer.DockerID, + field.Error: err, + }) } else { containerMetric.NetworkStatsSet = 
networkStats } @@ -788,7 +825,11 @@ func (engine *DockerStatsEngine) ContainerDockerStats(taskARN string, containerI } containerNetworkRateStats = taskStats.StatsQueue.GetLastNetworkStatPerSec() } else { - seelog.Warnf("Network stats not found for container %s", containerID) + logger.Warn("Network stats not found for container", logger.Fields{ + field.TaskID: task.GetID(), + field.Container: containerID, + field.Error: err, + }) } } diff --git a/agent/stats/task.go b/agent/stats/task.go index b8cdea6fcbd..fb692c03ae5 100644 --- a/agent/stats/task.go +++ b/agent/stats/task.go @@ -18,12 +18,14 @@ import ( "fmt" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + apitaskstatus "github.com/aws/amazon-ecs-agent/agent/api/task/status" "github.com/aws/amazon-ecs-agent/agent/config" "github.com/aws/amazon-ecs-agent/agent/stats/resolver" "github.com/aws/amazon-ecs-agent/agent/utils/retry" - "github.com/cihub/seelog" "github.com/docker/docker/api/types" dockerstats "github.com/docker/docker/api/types" ) @@ -51,31 +53,40 @@ func (taskStat *StatsTask) StopStatsCollection() { } func (taskStat *StatsTask) collect() { - taskArn := taskStat.TaskMetadata.TaskArn + taskId := taskStat.TaskMetadata.TaskId backoff := retry.NewExponentialBackoff(time.Second*1, time.Second*10, 0.5, 2) for { err := taskStat.processStatsStream() select { case <-taskStat.Ctx.Done(): - seelog.Debugf("Stopping stats collection for taskStat %s", taskArn) + logger.Debug("Stopping stats collection for taskStat", logger.Fields{ + field.TaskID: taskId, + }) return default: if err != nil { d := backoff.Duration() time.Sleep(d) - seelog.Debugf("Error querying stats for task %s: %v", taskArn, err) + logger.Debug("Error querying stats for task", logger.Fields{ + field.TaskID: taskId, + field.Error: err, + }) } // We were disconnected from the stats stream. // Check if the task is terminal. If it is, stop collecting metrics. terminal, err := taskStat.terminal() if err != nil { // Error determining if the task is terminal. clean-up anyway. 
- seelog.Warnf("Error determining if the task %s is terminal, stopping stats collection: %v", - taskArn, err) + logger.Warn("Error determining if the task is terminal, stopping stats collection", logger.Fields{ + field.TaskID: taskId, + field.Error: err, + }) taskStat.StopStatsCollection() } else if terminal { - seelog.Infof("Task %s is terminal, stopping stats collection", taskArn) + logger.Warn("Task is terminal, stopping stats collection", logger.Fields{ + field.TaskID: taskId, + }) taskStat.StopStatsCollection() } } @@ -83,18 +94,22 @@ func (taskStat *StatsTask) collect() { } func (taskStat *StatsTask) processStatsStream() error { - taskArn := taskStat.TaskMetadata.TaskArn + taskId := taskStat.TaskMetadata.TaskId awsvpcNetworkStats, errC := taskStat.getAWSVPCNetworkStats() returnError := false for { select { case <-taskStat.Ctx.Done(): - seelog.Info("task context is done") + logger.Info("Task context is done", logger.Fields{ + field.TaskID: taskId, + }) return nil case err := <-errC: - seelog.Warnf("Error encountered processing metrics stream from host, this may affect "+ - "cloudwatch metric accuracy: %s", err) + logger.Warn("Error encountered processing metrics stream from host, this may affect cloudwatch metric accuracy", logger.Fields{ + field.TaskID: taskId, + field.Error: err, + }) returnError = true case rawStat, ok := <-awsvpcNetworkStats: if !ok { @@ -104,7 +119,10 @@ func (taskStat *StatsTask) processStatsStream() error { return nil } if err := taskStat.StatsQueue.Add(rawStat); err != nil { - seelog.Warnf("Task [%s]: error converting stats: %v", taskArn, err) + logger.Warn("Error converting task stats", logger.Fields{ + field.TaskID: taskId, + field.Error: err, + }) } } diff --git a/agent/stats/task_linux.go b/agent/stats/task_linux.go index ca07c3a7571..b7374a6f4bd 100644 --- a/agent/stats/task_linux.go +++ b/agent/stats/task_linux.go @@ -48,7 +48,7 @@ type StatsTask struct { netlinkinterface netlinkwrapper.NetLink } -func newStatsTaskContainer(taskARN string, containerPID string, numberOfContainers int, +func newStatsTaskContainer(taskARN, taskId, containerPID string, numberOfContainers int, resolver resolver.ContainerMetadataResolver, publishInterval time.Duration, _ task.TaskENIs) (*StatsTask, error) { nsAgent := nswrapper.NewNS() netlinkclient := netlinkwrapper.New() @@ -58,6 +58,7 @@ func newStatsTaskContainer(taskARN string, containerPID string, numberOfContaine statsTaskCommon: &statsTaskCommon{ TaskMetadata: &TaskMetadata{ TaskArn: taskARN, + TaskId: taskId, ContainerPID: containerPID, NumberContainers: numberOfContainers, }, diff --git a/agent/stats/task_unspecified.go b/agent/stats/task_unspecified.go index c7ed848c4ad..bdb6e442468 100644 --- a/agent/stats/task_unspecified.go +++ b/agent/stats/task_unspecified.go @@ -31,7 +31,7 @@ type StatsTask struct { *statsTaskCommon } -func newStatsTaskContainer(taskARN string, containerPID string, numberOfContainers int, +func newStatsTaskContainer(taskARN, taskId, containerPID string, numberOfContainers int, resolver resolver.ContainerMetadataResolver, publishInterval time.Duration, _ task.TaskENIs) (*StatsTask, error) { return nil, errors.New("Unsupported platform") } diff --git a/agent/stats/task_windows.go b/agent/stats/task_windows.go index 4bbc84fbaa9..165cab6ca02 100644 --- a/agent/stats/task_windows.go +++ b/agent/stats/task_windows.go @@ -68,7 +68,7 @@ type StatsTask struct { *statsTaskCommon } -func newStatsTaskContainer(taskARN string, containerPID string, numberOfContainers int, +func 
newStatsTaskContainer(taskARN, taskId, containerPID string, numberOfContainers int, resolver resolver.ContainerMetadataResolver, publishInterval time.Duration, taskENIs task.TaskENIs) (*StatsTask, error) { ctx, cancel := context.WithCancel(context.Background()) diff --git a/agent/stats/types.go b/agent/stats/types.go index 113a4213fac..bc1540149ce 100644 --- a/agent/stats/types.go +++ b/agent/stats/types.go @@ -70,6 +70,7 @@ type ContainerMetadata struct { // TaskMetadata contains meta-data information for a task. type TaskMetadata struct { TaskArn string `json:"-"` + TaskId string `json:"-"` // ContainerPID is the PID of the pause container in the awsvpc task. ContainerPID string `json:"-"` DeviceName []string `json:"-"` diff --git a/agent/taskresource/asmsecret/asmsecret.go b/agent/taskresource/asmsecret/asmsecret.go index 8ace7eb21c1..f8fcd3ac66f 100644 --- a/agent/taskresource/asmsecret/asmsecret.go +++ b/agent/taskresource/asmsecret/asmsecret.go @@ -269,7 +269,7 @@ func (secret *ASMSecretResource) Create() error { // Get the maximum number of errors to be returned, which will be one error per goroutine errorEvents := make(chan error, len(secret.requiredSecrets)) - seelog.Infof("ASM secret resource: retrieving secrets for containers in task: [%s]", secret.taskARN) + seelog.Debugf("ASM secret resource: retrieving secrets for containers in task: [%s]", secret.taskARN) secret.secretData = make(map[string]string) for _, asmsecret := range secret.getRequiredSecrets() { @@ -300,7 +300,7 @@ func (secret *ASMSecretResource) retrieveASMSecretValue(apiSecret apicontainer.S defer wg.Done() asmClient := secret.asmClientCreator.NewASMClient(apiSecret.Region, iamCredentials) - seelog.Infof("ASM secret resource: retrieving resource for secret %v in region %s for task: [%s]", apiSecret.ValueFrom, apiSecret.Region, secret.taskARN) + seelog.Debugf("ASM secret resource: retrieving resource for secret %v in region %s for task: [%s]", apiSecret.ValueFrom, apiSecret.Region, secret.taskARN) input, jsonKey, err := getASMParametersFromInput(apiSecret.ValueFrom) if err != nil { errorEvents <- fmt.Errorf("trying to retrieve secret with value %s resulted in error: %v", apiSecret.ValueFrom, err) diff --git a/agent/taskresource/cgroup/control/init_linux.go b/agent/taskresource/cgroup/control/init_linux.go index f06a15ca302..a29da1df472 100644 --- a/agent/taskresource/cgroup/control/init_linux.go +++ b/agent/taskresource/cgroup/control/init_linux.go @@ -17,14 +17,16 @@ package control import ( "github.com/aws/amazon-ecs-agent/agent/config" + "github.com/aws/amazon-ecs-agent/agent/logger" - "github.com/cihub/seelog" specs "github.com/opencontainers/runtime-spec/specs-go" ) // Init is used to setup the cgroup root for ecs func (c *control) Init() error { - seelog.Infof("Creating root ecs cgroup: %s", config.DefaultTaskCgroupPrefix) + logger.Info("Creating root ecs cgroup", logger.Fields{ + "cgroup": config.DefaultTaskCgroupPrefix, + }) // Build cgroup spec cgroupSpec := &Spec{ diff --git a/agent/taskresource/ssmsecret/ssmsecret.go b/agent/taskresource/ssmsecret/ssmsecret.go index 7a921832a52..d9ff06a52c2 100644 --- a/agent/taskresource/ssmsecret/ssmsecret.go +++ b/agent/taskresource/ssmsecret/ssmsecret.go @@ -335,7 +335,7 @@ func (secret *SSMSecretResource) retrieveSSMSecretValues(region string, names [] defer wg.Done() ssmClient := secret.ssmClientCreator.NewSSMClient(region, iamCredentials) - seelog.Infof("ssm secret resource: retrieving resource for secrets %v in region [%s] in task: [%s]", names, region, 
secret.taskARN) + seelog.Debugf("ssm secret resource: retrieving resource for secrets %v in region [%s] in task: [%s]", names, region, secret.taskARN) secValueMap, err := ssm.GetSecretsFromSSM(names, ssmClient) if err != nil { errorEvents <- fmt.Errorf("fetching secret data from SSM Parameter Store in %s: %v", region, err) diff --git a/agent/utils/arn.go b/agent/utils/arn.go new file mode 100644 index 00000000000..96a8a01318b --- /dev/null +++ b/agent/utils/arn.go @@ -0,0 +1,33 @@ +package utils + +import ( + "strings" + + "github.com/aws/aws-sdk-go/aws/arn" + "github.com/pkg/errors" +) + +const ( + arnResourceDelimiter = "/" +) + +// TaskIdFromArn derives the task id from the task arn +// Reference: http://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html#arn-syntax-ecs +func TaskIdFromArn(taskArn string) (string, error) { + // Parse taskARN + parsedARN, err := arn.Parse(taskArn) + if err != nil { + return "", err + } + + // Get task resource section + resource := parsedARN.Resource + + if !strings.Contains(resource, arnResourceDelimiter) { + return "", errors.New("malformed task ARN resource") + } + + resourceSplit := strings.Split(resource, arnResourceDelimiter) + + return resourceSplit[len(resourceSplit)-1], nil +} diff --git a/agent/wsclient/client.go b/agent/wsclient/client.go index 75b43c07efc..fbc6c450394 100644 --- a/agent/wsclient/client.go +++ b/agent/wsclient/client.go @@ -33,6 +33,8 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "crypto/tls" "github.com/aws/amazon-ecs-agent/agent/config" @@ -145,7 +147,9 @@ type MakeRequestHookFunc func([]byte) ([]byte, error) // 'MakeRequest' can be made after calling this, but responses will not be // receivable until 'Serve' is also called. func (cs *ClientServerImpl) Connect() error { - seelog.Infof("Establishing a Websocket connection to %s", cs.URL) + logger.Info("Establishing a Websocket connection", logger.Fields{ + "url": cs.URL, + }) parsedURL, err := url.Parse(cs.URL) if err != nil { return err
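// Illustrative sketch (not part of this patch): how the new utils.TaskIdFromArn helper pairs with
// the field.TaskID log field introduced above. The task ARN below is a made-up example.
//
//	taskArn := "arn:aws:ecs:us-west-2:123456789012:task/default/74de0355a10a4f979ac495c14EXAMPLE"
//	taskId, err := utils.TaskIdFromArn(taskArn) // yields "74de0355a10a4f979ac495c14EXAMPLE"
//	if err != nil {
//		// fall back to logging the full ARN when the ARN cannot be parsed
//	}
//	logger.Info("Task reached desired status", logger.Fields{
//		field.TaskID: taskId,
//	})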