Skip to content

Commit

Permalink
feat: allow disabling network and/or storage metric collection only
Browse files Browse the repository at this point in the history
Since not all metrics are likely to be of the same relevance, it would
be beneficial from a cost perspective to allow disabling some of them,
namely Network IO and Block IO.

This change introduces the ability to disable network and storage stats
collection via ECS_DISABLE_NETWORK_METRICS and
ECS_DISABLE_STORAGE_METRICS respectively.
  • Loading branch information
weeniearms committed Nov 11, 2024
1 parent 12cb750 commit 409dbe7
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 66 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ additional details on each available environment variable.
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | false |
| `ECS_DISABLE_STORAGE_METRICS` | <true | false> | Whether to disable storage metrics gathering for tasks. | false | false |
| `ECS_DISABLE_NETWORK_METRICS` | <true | false> | Whether to disable network metrics gathering for tasks. | false | false |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. Setting this value to `true` can help reduce the CPU usage of dockerd and containerd on the ECS container instance. See also ECS_POLL_METRICS_WAIT_DURATION for setting the poll interval. | `false` | `false` |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. If user sets above maximum it will be set to max, and if below minimum it will be set to min. As the number of tasks/containers increase, a higher `ECS_POLLING_METRICS_WAIT_DURATION` value can potentially cause a problem where memory reservation value of ECS cluster reported in metrics becomes unstable due to missing metrics sample at metric collection time. It is recommended to keep this value smaller than 18s. This behavior is only observed on certain OS and platforms. | 10s | 10s |
| `ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT` | <true | false> | Whether to pull images for containers with dependencies before the dependsOn condition has been satisfied. | false | false |
Expand Down
6 changes: 6 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,8 @@ func environmentConfig() (Config, error) {
UpdatesEnabled: parseBooleanDefaultFalseConfig("ECS_UPDATES_ENABLED"),
UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"),
DisableMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_METRICS"),
DisableNetworkMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_NETWORK_METRICS"),
DisableStorageMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_STORAGE_METRICS"),
ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"),
AvailableLoggingDrivers: parseAvailableLoggingDrivers(),
PrivilegedDisabled: parseBooleanDefaultFalseConfig("ECS_DISABLE_PRIVILEGED"),
Expand Down Expand Up @@ -626,6 +628,8 @@ func (cfg *Config) String() string {
"AuthType: %v, "+
"UpdatesEnabled: %v, "+
"DisableMetrics: %v, "+
"DisableNetworkMetrics: %v, "+
"DisableStorageMetrics: %v, "+
"PollMetrics: %v, "+
"PollingMetricsWaitDuration: %v, "+
"ReservedMem: %v, "+
Expand All @@ -646,6 +650,8 @@ func (cfg *Config) String() string {
cfg.EngineAuthType,
cfg.UpdatesEnabled,
cfg.DisableMetrics,
cfg.DisableNetworkMetrics,
cfg.DisableStorageMetrics,
cfg.PollMetrics,
cfg.PollingMetricsWaitDuration,
cfg.ReservedMemory,
Expand Down
4 changes: 4 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,14 @@ func TestConfigBoolean(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_DISABLE_DOCKER_HEALTH_CHECK", "true")()
defer setTestEnv("ECS_DISABLE_METRICS", "true")()
defer setTestEnv("ECS_DISABLE_NETWORK_METRICS", "true")()
defer setTestEnv("ECS_DISABLE_STORAGE_METRICS", "true")()
defer setTestEnv("ECS_ENABLE_SPOT_INSTANCE_DRAINING", "true")()
cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.True(t, cfg.DisableMetrics.Enabled())
assert.True(t, cfg.DisableNetworkMetrics.Enabled())
assert.True(t, cfg.DisableStorageMetrics.Enabled())
assert.True(t, cfg.DisableDockerHealthCheck.Enabled())
assert.True(t, cfg.SpotInstanceDrainingEnabled.Enabled())
}
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ func DefaultConfig() Config {
DataDir: "/data/",
DataDirOnHost: "/var/lib/ecs",
DisableMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
DisableNetworkMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
DisableStorageMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver},
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, "unix:///var/run/docker.sock", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
assert.Equal(t, "/data/", cfg.DataDir, "Default datadir set incorrectly")
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
assert.Equal(t, 5, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, "npipe:////./pipe/docker_engine", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
assert.Equal(t, `C:\ProgramData\Amazon\ECS\data`, cfg.DataDir, "Default datadir set incorrectly")
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
assert.Equal(t, 11, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")
Expand Down
8 changes: 8 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,14 @@ type Config struct {
// sent to the ECS telemetry endpoint
DisableMetrics BooleanDefaultFalse

// DisableNetworkMetrics configures whether task network IO utilization metrics should be
// sent to the ECS telemetry endpoint
DisableNetworkMetrics BooleanDefaultFalse

// DisableStorageMetrics configures whether task block IO utilization metrics should be
// sent to the ECS telemetry endpoint
DisableStorageMetrics BooleanDefaultFalse

// PollMetrics configures whether metrics are constantly streamed for each container or
// polled on interval instead.
PollMetrics BooleanDefaultFalse
Expand Down
102 changes: 53 additions & 49 deletions agent/stats/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -864,14 +864,16 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
MemoryStatsSet: memoryStatsSet,
}

storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting storage stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.StorageStatsSet = storageStatsSet
if !engine.config.DisableStorageMetrics.Enabled() {
storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting storage stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.StorageStatsSet = storageStatsSet
}
}

restartStatsSet, err := container.statsQueue.GetRestartStatsSet()
Expand All @@ -887,51 +889,53 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
containerMetric.RestartStatsSet = restartStatsSet
}

task, err := engine.resolver.ResolveTask(dockerID)
if err != nil {
logger.Warn("Task not found for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
logger.Warn("Could not map container ID to container, container", logger.Fields{
field.DockerId: dockerID,
field.Error: err,
if !engine.config.DisableNetworkMetrics.Enabled() {
task, err := engine.resolver.ResolveTask(dockerID)
if err != nil {
logger.Warn("Task not found for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
// send network stats for default/bridge/nat/awsvpc network modes
if task.IsNetworkModeBridge() {
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
} else {
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
// we log the error and still continue to publish cpu, memory stats
logger.Warn("Error getting network stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
logger.Warn("Could not map container ID to container, container", logger.Fields{
field.DockerId: dockerID,
field.Error: err,
})
} else {
// send network stats for default/bridge/nat/awsvpc network modes
if task.IsNetworkModeBridge() {
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
} else {
containerMetric.NetworkStatsSet = networkStatsSet
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
// we log the error and still continue to publish cpu, memory stats
logger.Warn("Error getting network stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStatsSet
}
}
}
} else if task.IsNetworkModeAWSVPC() {
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
if !taskExistsInTaskStats {
return nil, fmt.Errorf("task not found")
}
// do not add network stats for pause container
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting network stats for container", logger.Fields{
field.TaskARN: taskArn,
field.Container: dockerContainer.DockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStats
} else if task.IsNetworkModeAWSVPC() {
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
if !taskExistsInTaskStats {
return nil, fmt.Errorf("task not found")
}
// do not add network stats for pause container
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting network stats for container", logger.Fields{
field.TaskARN: taskArn,
field.Container: dockerContainer.DockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStats
}
}
}
}
Expand Down
31 changes: 22 additions & 9 deletions agent/stats/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,20 +724,22 @@ func TestSynchronizeOnRestart(t *testing.T) {

func TestTaskNetworkStatsSet(t *testing.T) {
var networkModes = []struct {
ENIs []*ni.NetworkInterface
NetworkMode string
ServiceConnectEnabled bool
StatsEmpty bool
ENIs []*ni.NetworkInterface
NetworkMode string
ServiceConnectEnabled bool
NetworkMetricsDisabled bool
StatsEmpty bool
}{
{nil, DefaultNetworkMode, false, false},
{nil, DefaultNetworkMode, true, true},
{nil, DefaultNetworkMode, false, false, false},
{nil, DefaultNetworkMode, true, false, true},
{nil, DefaultNetworkMode, false, true, true},
}
for _, tc := range networkModes {
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.StatsEmpty)
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.NetworkMetricsDisabled, tc.StatsEmpty)
}
}

func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, emptyStats bool) {
func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, networkMetricsDisabled, emptyStats bool) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
resolver := mock_resolver.NewMockContainerMetadataResolver(mockCtrl)
Expand Down Expand Up @@ -782,7 +784,18 @@ func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterf
State: &types.ContainerState{Pid: 23},
},
}, nil).AnyTimes()
engine := NewDockerStatsEngine(&cfg, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)

var c *config.Config
if networkMetricsDisabled {
c = &config.Config{
DisableNetworkMetrics: config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled},
}
c = c.Merge(cfg)
} else {
c = &cfg
}

engine := NewDockerStatsEngine(c, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
engine.ctx = ctx
Expand Down
18 changes: 10 additions & 8 deletions agent/stats/engine_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,19 @@ const (

func TestLinuxTaskNetworkStatsSet(t *testing.T) {
var networkModes = []struct {
ENIs []*ni.NetworkInterface
NetworkMode string
StatsEmpty bool
ENIs []*ni.NetworkInterface
NetworkMode string
NetworkMetricsDisabled bool
StatsEmpty bool
}{
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", true},
{nil, "host", true},
{nil, "bridge", false},
{nil, "none", true},
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", false, true},
{nil, "host", false, true},
{nil, "bridge", false, false},
{nil, "bridge", true, true},
{nil, "none", false, true},
}
for _, tc := range networkModes {
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.StatsEmpty)
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.NetworkMetricsDisabled, tc.StatsEmpty)
}
}

Expand Down

0 comments on commit 409dbe7

Please sign in to comment.