Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change default docker metric gathering behavior #2452

Merged
merged 3 commits into from
May 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ additional details on each available environment variable.
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. Only used when ECS_POLL_METRICS is true | 15s | 15s |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. This defaulted to `false` previous to agent version 1.40.0. WARNING: setting this to false on an instance with many containers can result in very high CPU utilization by the agent, dockerd, and containerd. | `true` | `true` |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. If user sets above maximum it will be set to max, and if below minimum it will be set to min. | 10s | 10s |
| `ECS_RESERVED_MEMORY` | 32 | Memory, in MiB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 |
| `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` |
| `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` |
Expand Down
18 changes: 10 additions & 8 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const (

// DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration
// This is only used when PollMetrics is set to true
DefaultPollingMetricsWaitDuration = 15 * time.Second
DefaultPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval / 2

// defaultDockerStopTimeout specifies the value for container stop timeout duration
defaultDockerStopTimeout = 30 * time.Second
Expand Down Expand Up @@ -97,11 +97,11 @@ const (

// minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
minimumPollingMetricsWaitDuration = 1 * time.Second
minimumPollingMetricsWaitDuration = 5 * time.Second

// maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
maximumPollingMetricsWaitDuration = 20 * time.Second
maximumPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval

// minimumDockerStopTimeout specifies the minimum value for docker StopContainer API
minimumDockerStopTimeout = 1 * time.Second
Expand Down Expand Up @@ -350,13 +350,15 @@ func (cfg *Config) validateAndOverrideBounds() error {
func (cfg *Config) pollMetricsOverrides() {
if cfg.PollMetrics {
if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is less than the minimum of %s. Setting polling interval to minimum.",
cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = minimumPollingMetricsWaitDuration
}

if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is greater than the maximum of %s. Setting polling interval to maximum.",
cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = maximumPollingMetricsWaitDuration
}
}
}
Expand Down Expand Up @@ -547,7 +549,7 @@ func environmentConfig() (Config, error) {
SharedVolumeMatchFullConfig: utils.ParseBool(os.Getenv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG"), false),
ContainerInstanceTags: containerInstanceTags,
ContainerInstancePropagateTagsFrom: parseContainerInstancePropagateTagsFrom(),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), true),
PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"),
DisableDockerHealthCheck: utils.ParseBool(os.Getenv("ECS_DISABLE_DOCKER_HEALTH_CHECK"), false),
GPUSupportEnabled: utils.ParseBool(os.Getenv("ECS_ENABLE_GPU_SUPPORT"), false),
Expand Down
15 changes: 12 additions & 3 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ func TestInvalidLoggingDriver(t *testing.T) {
func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) {
conf, err := environmentConfig()
assert.NoError(t, err)
assert.False(t, conf.PollMetrics)
assert.True(t, conf.PollMetrics)
}

func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) {
Expand Down Expand Up @@ -357,16 +357,25 @@ func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) {
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, maximumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "1s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, minimumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValuePollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, DefaultPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidFormatParseEnvVariableUint16(t *testing.T) {
Expand Down
72 changes: 36 additions & 36 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1369,56 +1369,56 @@ func (dg *dockerGoClient) Stats(ctx context.Context, id string, inactivityTimeou
}()
Copy link
Contributor Author

@sparrc sparrc May 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes here are because functional tests caught that polling metrics was taking longer to populate the task stats API versus streaming stats. This is because polling stats attempts to jitter the initial stats poll in order to avoid hammering the docker stats API on startup.

So change the behavior of polling metrics to poll for docker stats immediately on startup. This is not ideal from a load perspective but is a necessity to avoid unintentional behavior changes.

} else {
seelog.Infof("DockerGoClient: Starting to Poll for metrics for container %s", id)
// firstStatC jitters the time at which containers ask for their first stat.
// We use this channel to 'seed' the queue with docker stats so that containers
// can publish metrics more quickly.
firstStatC := time.After(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration/2))
// sleeping here jitters the time at which the ticker is created, so that
// containers do not synchronize on calling the docker stats api.
// the max sleep is 80% of the polling interval so that we have a chance to
// get two stats in the first publishing interval.
time.Sleep(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration*8/10))
statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration)

go func() {
defer cancelRequest()
defer close(statsC)
// we need to start by getting container stats so that the task stats
// endpoint will be populated immediately.
stats, err := getContainerStatsNotStreamed(client, subCtx, id)
if err != nil {
errC <- err
return
}
statsC <- stats

// sleeping here jitters the time at which the ticker is created, so that
// containers do not synchronize on calling the docker stats api.
// the max sleep is 80% of the polling interval so that we have a chance to
// get two stats in the first publishing interval.
time.Sleep(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration*8/10))
statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration)
defer statPollTicker.Stop()

for {
// this select statement is waiting on either the stat polling ticker channel
// or the firstStat time.After channel to fire. firstStat will fire
// first and then afterwards we will always grab stats on the ticker.
select {
case _, ok := <-statPollTicker.C:
if !ok {
return
}
case <-firstStatC:
}

stream := false
resp, err = client.ContainerStats(subCtx, id, stream)
if err != nil {
errC <- fmt.Errorf("DockerGoClient: Unable to retrieve stats for container %s: %v", id, err)
return
}

decoder := json.NewDecoder(resp.Body)
data := new(types.StatsJSON)
err := decoder.Decode(data)
for range statPollTicker.C {
stats, err := getContainerStatsNotStreamed(client, subCtx, id)
if err != nil {
errC <- fmt.Errorf("DockerGoClient: Unable to decode stats for container %s: %v", id, err)
errC <- err
return
}

statsC <- data
statsC <- stats
}
}()
}

return statsC, errC
}

func getContainerStatsNotStreamed(client sdkclient.Client, ctx context.Context, id string) (*types.StatsJSON, error) {
resp, err := client.ContainerStats(ctx, id, false)
if err != nil {
return nil, fmt.Errorf("DockerGoClient: Unable to retrieve stats for container %s: %v", id, err)
}

decoder := json.NewDecoder(resp.Body)
stats := &types.StatsJSON{}
err = decoder.Decode(stats)
if err != nil {
return nil, fmt.Errorf("DockerGoClient: Unable to decode stats for container %s: %v", id, err)
}

return stats, nil
}

func (dg *dockerGoClient) RemoveImage(ctx context.Context, imageName string, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
Expand Down
8 changes: 4 additions & 4 deletions scripts/run-integ-tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ Param (
)

if ($Platform -like "windows2016") {
$BaseImageName="mcr.microsoft.com/windows/servercore:ltsc2016"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:91368f3cff77ad42259ccb3bf3d1a4e145cf5fa9e486f23999d32711c2913f3e"
$BaseImageName="mcr.microsoft.com/windows/servercore@sha256:42be24b8810c861cc1b3fe75c5e99f75061cb45fdbae1de46d151c18cc8e6a9a"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:42be24b8810c861cc1b3fe75c5e99f75061cb45fdbae1de46d151c18cc8e6a9a"
} elseif ($Platform -like "windows2019") {
$BaseImageName="mcr.microsoft.com/windows/servercore:ltsc2019"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:e20960b4c06acee08af55164e3abc37b39cdc128ce2f5fcdf3397c738cb91069"
$BaseImageName="mcr.microsoft.com/windows/servercore@sha256:cc6d6da31014dceab4daee8b5a8da4707233f4ef42eaf071e45cee044ac738f4"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:cc6d6da31014dceab4daee8b5a8da4707233f4ef42eaf071e45cee044ac738f4"
} else {
echo "Invalid platform parameter"
exit 1
Expand Down