Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make CGroups CPU period configurable #1941

Merged
merged 1 commit into from
Apr 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ additional details on each available environment variable.
| `ECS_HOST_DATA_DIR` | `/var/lib/ecs` | The source directory on the host from which ECS_DATADIR is mounted. We use this to determine the source mount path for container metadata files in the case the ECS Agent is running as a container. We do not use this value in Windows because the ECS Agent is not running as container in Windows. | `/var/lib/ecs` | `Not used` |
| `ECS_ENABLE_TASK_CPU_MEM_LIMIT` | `true` | Whether to enable task-level cpu and memory limits | `true` | `false` |
| `ECS_CGROUP_PATH` | `/sys/fs/cgroup` | The root cgroup path that is expected by the ECS agent. This is the path that accessible from the agent mount. | `/sys/fs/cgroup` | Not applicable |
| `ECS_CGROUP_CPU_PERIOD` | `10ms` | CGroups CPU period for task level limits. This value should be between 8ms to 100ms | `100ms` | Not applicable |
| `ECS_ENABLE_CPU_UNBOUNDED_WINDOWS_WORKAROUND` | `true` | When `true`, ECS will allow CPU unbounded(CPU=`0`) tasks to run along with CPU bounded tasks in Windows. | Not applicable | `false` |
| `ECS_TASK_METADATA_RPS_LIMIT` | `100,150` | Comma separated integer values for steady state and burst throttle limits for task metadata endpoint | `40,60` | `40,60` |
| `ECS_SHARED_VOLUME_MATCH_FULL_CONFIG` | `true` | When `true`, ECS Agent will compare name, driver options, and labels to make sure volumes are identical. When `false`, Agent will short circuit shared volume comparison if the names match. This is the default Docker behavior. If a volume is shared across instances, this should be set to `false`. | `false` | `false`|
Expand Down
2 changes: 1 addition & 1 deletion agent/api/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config,
// hook into this
task.adjustForPlatform(cfg)
if task.MemoryCPULimitsEnabled {
err := task.initializeCgroupResourceSpec(cfg.CgroupPath, resourceFields)
err := task.initializeCgroupResourceSpec(cfg.CgroupPath, cfg.CgroupCPUPeriod, resourceFields)
if err != nil {
seelog.Errorf("Task [%s]: could not intialize resource: %v", task.Arn, err)
return apierrors.NewResourceInitError(task.Arn, err)
Expand Down
12 changes: 6 additions & 6 deletions agent/api/task/task_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ func (task *Task) adjustForPlatform(cfg *config.Config) {
task.MemoryCPULimitsEnabled = cfg.TaskCPUMemLimit.Enabled()
}

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
cgroupRoot, err := task.BuildCgroupRoot()
if err != nil {
return errors.Wrapf(err, "cgroup resource: unable to determine cgroup root for task")
}
resSpec, err := task.BuildLinuxResourceSpec()
resSpec, err := task.BuildLinuxResourceSpec(cGroupCPUPeriod)
if err != nil {
return errors.Wrapf(err, "cgroup resource: unable to build resource spec for task")
}
Expand Down Expand Up @@ -85,13 +85,13 @@ func (task *Task) BuildCgroupRoot() (string, error) {
}

// BuildLinuxResourceSpec returns a linuxResources object for the task cgroup
func (task *Task) BuildLinuxResourceSpec() (specs.LinuxResources, error) {
func (task *Task) BuildLinuxResourceSpec(cGroupCPUPeriod time.Duration) (specs.LinuxResources, error) {
linuxResourceSpec := specs.LinuxResources{}

// If task level CPU limits are requested, set CPU quota + CPU period
// Else set CPU shares
if task.CPU > 0 {
linuxCPUSpec, err := task.buildExplicitLinuxCPUSpec()
linuxCPUSpec, err := task.buildExplicitLinuxCPUSpec(cGroupCPUPeriod)
if err != nil {
return specs.LinuxResources{}, err
}
Expand All @@ -116,13 +116,13 @@ func (task *Task) BuildLinuxResourceSpec() (specs.LinuxResources, error) {

// buildExplicitLinuxCPUSpec builds CPU spec when task CPU limits are
// explicitly requested
func (task *Task) buildExplicitLinuxCPUSpec() (specs.LinuxCPU, error) {
func (task *Task) buildExplicitLinuxCPUSpec(cGroupCPUPeriod time.Duration) (specs.LinuxCPU, error) {
if task.CPU > maxTaskVCPULimit {
return specs.LinuxCPU{},
errors.Errorf("task CPU spec builder: unsupported CPU limits, requested=%f, max-supported=%d",
task.CPU, maxTaskVCPULimit)
}
taskCPUPeriod := uint64(defaultCPUPeriod / time.Microsecond)
taskCPUPeriod := uint64(cGroupCPUPeriod / time.Microsecond)
taskCPUQuota := int64(task.CPU * float64(taskCPUPeriod))

// TODO: DefaultCPUPeriod only permits 10VCPUs.
Expand Down
16 changes: 8 additions & 8 deletions agent/api/task/task_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func TestBuildLinuxResourceSpecCPUMem(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -272,7 +272,7 @@ func TestBuildLinuxResourceSpecCPU(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -295,7 +295,7 @@ func TestBuildLinuxResourceSpecWithoutTaskCPULimits(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -319,7 +319,7 @@ func TestBuildLinuxResourceSpecWithoutTaskCPUWithContainerCPULimits(t *testing.T
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -342,7 +342,7 @@ func TestBuildLinuxResourceSpecInvalidMem(t *testing.T) {
}

expectedLinuxResourceSpec := specs.LinuxResources{}
linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.Error(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand Down Expand Up @@ -488,7 +488,7 @@ func TestInitCgroupResourceSpecHappyPath(t *testing.T) {
defer ctrl.Finish()
mockControl := mock_control.NewMockControl(ctrl)
mockIO := mock_ioutilwrapper.NewMockIOUtil(ctrl)
assert.NoError(t, task.initializeCgroupResourceSpec("cgroupPath", &taskresource.ResourceFields{
assert.NoError(t, task.initializeCgroupResourceSpec("cgroupPath", defaultCPUPeriod, &taskresource.ResourceFields{
Control: mockControl,
ResourceFieldsCommon: &taskresource.ResourceFieldsCommon{
IOUtil: mockIO,
Expand All @@ -512,7 +512,7 @@ func TestInitCgroupResourceSpecInvalidARN(t *testing.T) {
MemoryCPULimitsEnabled: true,
ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource),
}
assert.Error(t, task.initializeCgroupResourceSpec("", nil))
assert.Error(t, task.initializeCgroupResourceSpec("", time.Millisecond, nil))
assert.Equal(t, 0, len(task.GetResources()))
assert.Equal(t, 0, len(task.Containers[0].TransitionDependenciesMap))
}
Expand All @@ -533,7 +533,7 @@ func TestInitCgroupResourceSpecInvalidMem(t *testing.T) {
MemoryCPULimitsEnabled: true,
ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource),
}
assert.Error(t, task.initializeCgroupResourceSpec("", nil))
assert.Error(t, task.initializeCgroupResourceSpec("", time.Millisecond, nil))
assert.Equal(t, 0, len(task.GetResources()))
assert.Equal(t, 0, len(task.Containers[0].TransitionDependenciesMap))
}
Expand Down
3 changes: 2 additions & 1 deletion agent/api/task/task_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

const (
defaultCPUPeriod = 100 * time.Millisecond // 100ms

// With a 100ms CPU period, we can express 0.01 vCPU to 10 vCPUs
maxTaskVCPULimit = 10
// Reference: http://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
Expand All @@ -46,7 +47,7 @@ func (task *Task) adjustForPlatform(cfg *config.Config) {

func getCanonicalPath(path string) string { return path }

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
return nil
}

Expand Down
3 changes: 2 additions & 1 deletion agent/api/task/task_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"path/filepath"
"runtime"
"strings"
"time"

"github.com/aws/amazon-ecs-agent/agent/config"
"github.com/aws/amazon-ecs-agent/agent/taskresource"
Expand Down Expand Up @@ -120,6 +121,6 @@ func (task *Task) dockerCPUShares(containerCPU uint) int64 {
return int64(containerCPU)
}

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
return errors.New("unsupported platform")
}
6 changes: 6 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ const (

// DefaultNvidiaRuntime is the name of the runtime to pass Nvidia GPUs to containers
DefaultNvidiaRuntime = "nvidia"

// defaultCgroupCPUPeriod is set to 100 ms to set isCFS period and quota for task limits
defaultCgroupCPUPeriod = 100 * time.Millisecond
maximumCgroupCPUPeriod = 100 * time.Millisecond
minimumCgroupCPUPeriod = 8 * time.Millisecond
)

const (
Expand Down Expand Up @@ -539,6 +544,7 @@ func environmentConfig() (Config, error) {
GPUSupportEnabled: utils.ParseBool(os.Getenv("ECS_ENABLE_GPU_SUPPORT"), false),
NvidiaRuntime: os.Getenv("ECS_NVIDIA_RUNTIME"),
TaskMetadataAZDisabled: utils.ParseBool(os.Getenv("ECS_DISABLE_TASK_METADATA_AZ"), false),
CgroupCPUPeriod: parseCgroupCPUPeriod(),
}, err
}

Expand Down
3 changes: 3 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_NVIDIA_RUNTIME", "nvidia")()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "10s")()
defer setTestEnv("ECS_CGROUP_CPU_PERIOD", "")
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
setTestEnv("ECS_HOST_DATA_DIR", "/etc/ecs/")
setTestEnv("ECS_CGROUP_CPU_PERIOD", "10ms")

conf, err := environmentConfig()
assert.NoError(t, err)
Expand Down Expand Up @@ -163,6 +165,7 @@ func TestEnvironmentConfig(t *testing.T) {
assert.True(t, conf.GPUSupportEnabled, "Wrong value for GPUSupportEnabled")
assert.Equal(t, "nvidia", conf.NvidiaRuntime)
assert.True(t, conf.TaskMetadataAZDisabled, "Wrong value for TaskMetadataAZDisabled")
assert.Equal(t, 10*time.Millisecond, conf.CgroupCPUPeriod)
}

func TestTrimWhitespaceWhenCreating(t *testing.T) {
Expand Down
1 change: 1 addition & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ func DefaultConfig() Config {
PollMetrics: false,
PollingMetricsWaitDuration: DefaultPollingMetricsWaitDuration,
NvidiaRuntime: DefaultNvidiaRuntime,
CgroupCPUPeriod: defaultCgroupCPUPeriod,
}
}

Expand Down
43 changes: 43 additions & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, DefaultTaskMetadataBurstRate, cfg.TaskMetadataBurstRate,
"Default TaskMetadataBurstRate is set incorrectly")
assert.False(t, cfg.SharedVolumeMatchFullConfig, "Default SharedVolumeMatchFullConfig set incorrectly")
assert.Equal(t, defaultCgroupCPUPeriod, cfg.CgroupCPUPeriod, "CFS cpu period set incorrectly")
}

// TestConfigFromFile tests the configuration can be read from file
Expand Down Expand Up @@ -209,3 +210,45 @@ func TestEmptyNvidiaRuntime(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, DefaultNvidiaRuntime, cfg.NvidiaRuntime, "Wrong value for NvidiaRuntime")
}

func TestCPUPeriodSettings(t *testing.T) {
cases := []struct {
Name string
Env string
Response time.Duration
}{
{
Name: "OverrideDefaultCPUPeriod",
Env: "10ms",
Response: 10 * time.Millisecond,
},
{
Name: "DefaultCPUPeriod",
Env: "",
Response: defaultCgroupCPUPeriod,
},
{
Name: "TestCPUPeriodUpperBoundLimit",
Env: "110ms",
Response: defaultCgroupCPUPeriod,
},
{
Name: "TestCPUPeriodLowerBoundLimit",
Env: "7ms",
Response: defaultCgroupCPUPeriod,
},
}

for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
defer setTestRegion()()
defer os.Setenv("ECS_CGROUP_CPU_PERIOD", "100ms")

os.Setenv("ECS_CGROUP_CPU_PERIOD", c.Env)
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())

assert.NoError(t, err)
assert.Equal(t, c.Response, conf.CgroupCPUPeriod, "Wrong value for CgroupCPUPeriod")
})
}
}
13 changes: 13 additions & 0 deletions agent/config/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,16 @@ func parseImageCleanupExclusionList(envVar string) []string {
}
return imageCleanupExclusionList
}

func parseCgroupCPUPeriod() time.Duration {
duration := parseEnvVariableDuration("ECS_CGROUP_CPU_PERIOD")

if duration >= minimumCgroupCPUPeriod && duration <= maximumCgroupCPUPeriod {
return duration
} else if duration != 0 {
seelog.Warnf("CPU Period duration value: %v for Environment Variable ECS_CGROUP_CPU_PERIOD is not within [%v, %v], using default value instead",
duration, minimumCgroupCPUPeriod, maximumCgroupCPUPeriod)
}

return defaultCgroupCPUPeriod
}
3 changes: 3 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,7 @@ type Config struct {

// TaskMetadataAZDisabled specifies if availability zone should be disabled in Task Metadata endpoint
TaskMetadataAZDisabled bool

// CgroupCPUPeriod is config option to set different CFS quota and period values in microsecond, defaults to 100 ms
CgroupCPUPeriod time.Duration
}