Support Unified Cgroups (cgroups v2) #3127

Merged (4 commits, Mar 2, 2022)
Changes from 1 commit
24 changes: 22 additions & 2 deletions agent/api/task/task_linux.go
@@ -76,14 +76,34 @@ func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPerio
 }

 // BuildCgroupRoot helps build the task cgroup prefix
-// Example: /ecs/task-id
+// Example v1: /ecs/task-id
+// Example v2: ECSTasks-$TASKID.slice
 func (task *Task) BuildCgroupRoot() (string, error) {
 	taskID, err := task.GetID()
 	if err != nil {
 		return "", errors.Wrapf(err, "task build cgroup root: unable to get task-id from task ARN: %s", task.Arn)
 	}

-	return filepath.Join(config.DefaultTaskCgroupPrefix, taskID), nil
+	if config.CgroupV2 {
+		return buildCgroupV2Root(taskID), nil
+	}
+	return buildCgroupV1Root(taskID), nil
 }
+
+func buildCgroupV1Root(taskID string) string {
+	return filepath.Join(config.DefaultTaskCgroupV1Prefix, taskID)
+}
+
+// buildCgroupV2Root creates a root cgroup using the systemd driver's special "-"
+// character. The "-" specifies a parent slice, so tasks and their containers end up
+// looking like this in the cgroup directory:
+// /sys/fs/cgroup/ECSTasks.slice/
+// ├── ECSTasks-XXXXf406f70c4c678073ae96944fXXXX.slice
+// │   └── docker-XXXX7c6dc81f2e9a8bf1c566dc769733ccba594b3007dd289a0f50ad7923XXXX.scope
+// └── ECSTasks-XXXX30467358463ab6bbba4e73afXXXX.slice
+//     └── docker-XXXX7ef4e942552437c96051356859c1df169f16e1cf9a9fc96fd30614e6XXXX.scope
+func buildCgroupV2Root(taskID string) string {
+	return fmt.Sprintf("%s-%s.slice", config.DefaultTaskCgroupV2Prefix, taskID)
+}

 // BuildLinuxResourceSpec returns a linuxResources object for the task cgroup
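As an aside, the "-" expansion described in the comment above can be made concrete with a minimal standalone sketch. This is illustrative only, not part of the PR; it assumes the unified hierarchy is mounted at /sys/fs/cgroup, and the helper name is hypothetical:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// sliceToCgroupPath mimics systemd's rule that every "-" in a slice name
// introduces a parent slice, e.g. "ECSTasks-abc.slice" lives under
// "ECSTasks.slice".
func sliceToCgroupPath(slice string) string {
	name := strings.TrimSuffix(slice, ".slice")
	parts := strings.Split(name, "-")
	path := "/sys/fs/cgroup" // assumed cgroup v2 mount point
	prefix := ""
	for _, p := range parts[:len(parts)-1] {
		prefix += p
		path = filepath.Join(path, prefix+".slice")
		prefix += "-"
	}
	return filepath.Join(path, slice)
}

func main() {
	fmt.Println(sliceToCgroupPath("ECSTasks-111mytaskid.slice"))
	// Output: /sys/fs/cgroup/ECSTasks.slice/ECSTasks-111mytaskid.slice
}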
10 changes: 10 additions & 0 deletions agent/api/task/task_linux_test.go
@@ -244,6 +244,16 @@ func TestBuildCgroupRootErrorPath(t *testing.T) {
 	assert.Empty(t, cgroupRoot)
 }

+func TestBuildCgroupV1Root(t *testing.T) {
+	cgroupRoot := buildCgroupV1Root("111mytaskid")
+	assert.Equal(t, "/ecs/111mytaskid", cgroupRoot)
+}
+
+func TestBuildCgroupV2Root(t *testing.T) {
+	cgroupRoot := buildCgroupV2Root("111mytaskid")
+	assert.Equal(t, "ECSTasks-111mytaskid.slice", cgroupRoot)
+}
+
 // TestBuildLinuxResourceSpecCPUMem validates the linux resource spec builder
 func TestBuildLinuxResourceSpecCPUMem(t *testing.T) {
 	taskMemoryLimit := int64(taskMemoryLimit)
3 changes: 3 additions & 0 deletions agent/config/config.go
@@ -187,6 +187,9 @@ var (
 	// DefaultPauseContainerTag is the tag for the pause container image. The linker's load
 	// flags are used to populate this value from the Makefile
 	DefaultPauseContainerTag = ""
+
+	// CgroupV2 specifies whether or not to run in cgroups v2 mode.
+	CgroupV2 = false
 )

 // Merge merges two config files, preferring the ones on the left. Any nil or
@@ -13,24 +13,12 @@
 // express or implied. See the License for the specific language governing
 // permissions and limitations under the License.

-package control
+package config

-import (
-	"github.com/aws/amazon-ecs-agent/agent/config"
-
-	"github.com/cihub/seelog"
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-)
+import "github.com/containerd/cgroups"

-// Init is used to setup the cgroup root for ecs
-func (c *control) Init() error {
-	seelog.Infof("Creating root ecs cgroup: %s", config.DefaultTaskCgroupPrefix)
-
-	// Build cgroup spec
-	cgroupSpec := &Spec{
-		Root:  config.DefaultTaskCgroupPrefix,
-		Specs: &specs.LinuxResources{},
-	}
-	_, err := c.Create(cgroupSpec)
-	return err
-}
+func init() {
+	if cgroups.Mode() == cgroups.Unified {
+		CgroupV2 = true
+	}
+}
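For reference, the detection that this init() performs can be exercised standalone with the same containerd/cgroups call the diff imports; the switch over the other modes below is illustrative:

package main

import (
	"fmt"

	"github.com/containerd/cgroups"
)

func main() {
	// cgroups.Mode() inspects /sys/fs/cgroup and reports how cgroups are mounted.
	switch cgroups.Mode() {
	case cgroups.Unified:
		fmt.Println("cgroups v2 only (unified hierarchy); the agent sets CgroupV2 = true")
	case cgroups.Hybrid:
		fmt.Println("v1 controllers plus a v2 mount; the agent stays in v1 mode")
	case cgroups.Legacy:
		fmt.Println("cgroups v1 only; the agent stays in v1 mode")
	default:
		fmt.Println("cgroups unavailable")
	}
}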
6 changes: 4 additions & 2 deletions agent/config/config_unix.go
@@ -33,8 +33,10 @@ const (
 	// defaultRuntimeStatsLogFile stores the path where the golang runtime stats are periodically logged
 	defaultRuntimeStatsLogFile = `/log/agent-runtime-stats.log`

-	// DefaultTaskCgroupPrefix is default cgroup prefix for ECS tasks
-	DefaultTaskCgroupPrefix = "/ecs"
+	// DefaultTaskCgroupV1Prefix is the default cgroup v1 prefix for ECS tasks
+	DefaultTaskCgroupV1Prefix = "/ecs"
+	// DefaultTaskCgroupV2Prefix is the default cgroup v2 prefix for ECS tasks
+	DefaultTaskCgroupV2Prefix = "ECSTasks"

 	// Default cgroup memory system root path, this is the default used if the
 	// path has not been configured through ECS_CGROUP_PATH
6 changes: 3 additions & 3 deletions agent/engine/docker_task_engine_linux_test.go
@@ -101,7 +101,7 @@ func TestResourceContainerProgression(t *testing.T) {
 	gomock.InOrder(
 		// Ensure that the resource is created first
 		mockControl.EXPECT().Exists(gomock.Any()).Return(false),
-		mockControl.EXPECT().Create(gomock.Any()).Return(nil, nil),
+		mockControl.EXPECT().Create(gomock.Any()).Return(nil),
 		mockIO.EXPECT().WriteFile(cgroupMemoryPath, gomock.Any(), gomock.Any()).Return(nil),
 		imageManager.EXPECT().AddAllImageStates(gomock.Any()).AnyTimes(),
 		client.EXPECT().PullImage(gomock.Any(), sleepContainer.Image, nil, gomock.Any()).Return(dockerapi.DockerContainerMetadata{}),
@@ -265,7 +265,7 @@ func TestResourceContainerProgressionFailure(t *testing.T) {
 	gomock.InOrder(
 		// resource creation failure
 		mockControl.EXPECT().Exists(gomock.Any()).Return(false),
-		mockControl.EXPECT().Create(gomock.Any()).Return(nil, errors.New("cgroup create error")),
+		mockControl.EXPECT().Create(gomock.Any()).Return(errors.New("cgroup create error")),
 	)
 	mockTime.EXPECT().Now().Return(time.Now()).AnyTimes()
@@ -348,7 +348,7 @@ func TestTaskCPULimitHappyPath(t *testing.T) {
 		},
 	}
 	mockControl.EXPECT().Exists(gomock.Any()).Return(false)
-	mockControl.EXPECT().Create(gomock.Any()).Return(nil, nil)
+	mockControl.EXPECT().Create(gomock.Any()).Return(nil)
 	mockIO.EXPECT().WriteFile(cgroupMemoryPath, gomock.Any(), gomock.Any()).Return(nil)
 }
6 changes: 6 additions & 0 deletions agent/go.sum
@@ -20,6 +20,7 @@ github.com/buger/jsonparser v0.0.0-20180808090653-f4dd9f5a6b44/go.mod h1:bbYlZJ7
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 h1:kHaBemcxl8o/pQ5VM1c8PVE1PubbNx3mjUr09OqWGCs=
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575/go.mod h1:9d6lWj8KzO/fd/NrVaLscBKmPigpZpn5YawRPw+e3Yo=
github.com/cilium/ebpf v0.4.0 h1:QlHdikaxALkqWasW8hAC1mfR0jdmvbfaBdBPFmRSglA=
github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk=
@@ -62,6 +63,7 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
@@ -102,8 +104,10 @@ github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGi
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o=
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
@@ -156,13 +160,15 @@ github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln
github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU=
go.etcd.io/bbolt v1.3.6/go.mod h1:qXsaaIqmgQH0T+OPdb99Bf+PKfBBQVAdyD6TY9G8XM4=
go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA=
go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ=
golang.org/x/crypto v0.0.0-20171113213409-9f005a07e0d3 h1:f4/ZD59VsBOaJmWeI2yqtHvJhmRRPzi73C88ZtfhAIk=
golang.org/x/crypto v0.0.0-20171113213409-9f005a07e0d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/net v0.0.0-20191204025024-5ee1b9f4859a h1:+HHJiFUXVOIS9mr1ThqkQD1N8vpFCfCShqADBM12KTc=
golang.org/x/net v0.0.0-20191204025024-5ee1b9f4859a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
5 changes: 2 additions & 3 deletions agent/stats/utils_unix.go
@@ -39,9 +39,8 @@ func dockerStatsToContainerStats(dockerStats *types.StatsJSON) (*ContainerStats,
 }

 func validateDockerStats(dockerStats *types.StatsJSON) error {
-	// The length of PercpuUsage represents the number of cores in an instance.
-	if len(dockerStats.CPUStats.CPUUsage.PercpuUsage) == 0 || numCores == uint64(0) {
Review thread:

Contributor: I couldn't quite figure out why we decided to check len(dockerStats.CPUStats.CPUUsage.PercpuUsage) == 0 in the first place, but it looks like we don't reference this array anywhere else, and using runtime.NumCPU is good enough for me.

Contributor: So we think the len(dockerStats.CPUStats.CPUUsage.PercpuUsage) == 0 check was redundant or a mistake? I'm hesitant to remove something when we don't have clarity on why it was introduced.

Contributor (author): PercpuUsage doesn't appear to be used anywhere in the agent, and it's no longer available with cgroups v2. I'm not sure how else we can check that it's safe to remove. FWIW there isn't really any alternative to removing the check, from what I can tell, since cgroups v2 stats simply don't provide this level of granularity. If you would prefer, we could keep an if-statement here and continue checking this array when we're using cgroups v1.

Contributor: I'm guessing the intention was to do a simple sanity check of the stats API. There is an alternative to the PercpuUsage field, online_cpus, which according to the Docker API doc should be available whether v1 or v2 cgroups is used:

    number_cpus = length(cpu_stats.cpu_usage.percpu_usage) or cpu_stats.online_cpus

But then I also saw this note: "If either precpu_stats.online_cpus or cpu_stats.online_cpus is nil then for compatibility with older daemons the length of the corresponding cpu_usage.percpu_usage array should be used." The earliest Docker API reference I could find is 1.18, and it does not contain the online_cpus field. Since the agent-supported Docker API version can go back to 1.17, which can still work with the 20.10 Docker engine (based on this doc), which is when Docker introduced cgroups v2 support, it's not 100% safe for us to check online_cpus when cgroups v2 is used. We can still keep the v1 check, but I'm not sure of a safe way to validate numCPU from Docker stats when using cgroups v2.

-		return fmt.Errorf("invalid container statistics reported, no cpu core usage reported")
+	if numCores == uint64(0) {
+		return fmt.Errorf("invalid number of cores returned from runtime.NumCPU, numCores=0")
 	}
 	return nil
 }
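As an illustration of the online_cpus fallback discussed in the thread above (not what the PR adopted; the merged code relies on runtime.NumCPU via the package-level numCores), a sketch using the Docker API types might look like this; the helper name is hypothetical:

package stats

import "github.com/docker/docker/api/types"

// numCPUsFromStats is a hypothetical helper following the Docker API doc:
// prefer cpu_stats.online_cpus when the daemon reports it, otherwise fall
// back to the length of percpu_usage for older daemons (cgroups v1 only).
func numCPUsFromStats(s *types.StatsJSON) uint32 {
	if s.CPUStats.OnlineCPUs > 0 {
		return s.CPUStats.OnlineCPUs
	}
	return uint32(len(s.CPUStats.CPUUsage.PercpuUsage))
}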
7 changes: 4 additions & 3 deletions agent/stats/utils_unix_test.go
@@ -52,8 +52,9 @@ func TestDockerStatsToContainerStatsEmptyCpuUsageGeneratesError(t *testing.T) {
 	jsonBytes, _ := ioutil.ReadFile(inputJsonFile)
 	dockerStat := &types.StatsJSON{}
 	json.Unmarshal([]byte(jsonBytes), dockerStat)
-	// empty the PercpuUsage array
-	dockerStat.CPUStats.CPUUsage.PercpuUsage = make([]uint64, 0)
+	prevNumCores := numCores
+	numCores = uint64(0)
 	err := validateDockerStats(dockerStat)
-	assert.Error(t, err, "expected error converting container stats with empty PercpuUsage")
+	assert.Error(t, err, "expected error converting container stats with numCores=0")
+	numCores = prevNumCores
 }
29 changes: 17 additions & 12 deletions agent/taskresource/cgroup/cgroup.go
@@ -28,6 +28,7 @@ import (
 	apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container"
 	apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status"
 	"github.com/aws/amazon-ecs-agent/agent/api/task/status"
+	"github.com/aws/amazon-ecs-agent/agent/config"
 	"github.com/aws/amazon-ecs-agent/agent/taskresource"
 	control "github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control"
 	resourcestatus "github.com/aws/amazon-ecs-agent/agent/taskresource/status"
@@ -71,12 +72,14 @@ type CgroupResource struct {
 }

 // NewCgroupResource is used to return an object that implements the Resource interface
-func NewCgroupResource(taskARN string,
+func NewCgroupResource(
+	taskARN string,
 	control control.Control,
 	ioutil ioutilwrapper.IOUtil,
 	cgroupRoot string,
 	cgroupMountPath string,
-	resourceSpec specs.LinuxResources) *CgroupResource {
+	resourceSpec specs.LinuxResources,
+) *CgroupResource {
 	c := &CgroupResource{
 		taskARN: taskARN,
 		control: control,
@@ -256,18 +259,17 @@ func (cgroup *CgroupResource) GetCreatedAt() time.Time {
 func (cgroup *CgroupResource) Create() error {
 	err := cgroup.setupTaskCgroup()
 	if err != nil {
-		seelog.Criticalf("Cgroup resource [%s]: unable to setup cgroup root: %v", cgroup.taskARN, err)
+		// this error is already formatted in setupTaskCgroup function
 		return err
 	}
 	return nil
 }

 func (cgroup *CgroupResource) setupTaskCgroup() error {
 	cgroupRoot := cgroup.cgroupRoot
-	seelog.Debugf("Cgroup resource [%s]: setting up cgroup at: %s", cgroup.taskARN, cgroupRoot)

 	if cgroup.control.Exists(cgroupRoot) {
-		seelog.Debugf("Cgroup resource [%s]: cgroup at %s already exists, skipping creation", cgroup.taskARN, cgroupRoot)
+		seelog.Debugf("Cgroup already exists, skipping creation taskARN=%s cgroupPath=%s cgroupV2=%v", cgroup.taskARN, cgroupRoot, config.CgroupV2)
 		return nil
 	}

@@ -276,16 +278,19 @@ func (cgroup *CgroupResource) setupTaskCgroup() error {
 		Specs: &cgroup.resourceSpec,
 	}

-	_, err := cgroup.control.Create(&cgroupSpec)
+	seelog.Infof("Creating task cgroup taskARN=%s cgroupPath=%s cgroupV2=%v", cgroup.taskARN, cgroupRoot, config.CgroupV2)
+	err := cgroup.control.Create(&cgroupSpec)
 	if err != nil {
-		return fmt.Errorf("cgroup resource [%s]: setup cgroup: unable to create cgroup at %s: %w", cgroup.taskARN, cgroupRoot, err)
+		return fmt.Errorf("cgroup resource: setup cgroup: unable to create cgroup taskARN=%s cgroupPath=%s cgroupV2=%v err=%s", cgroup.taskARN, cgroupRoot, config.CgroupV2, err)
 	}

-	// enabling cgroup memory hierarchy by doing 'echo 1 > memory.use_hierarchy'
-	memoryHierarchyPath := filepath.Join(cgroup.cgroupMountPath, memorySubsystem, cgroupRoot, memoryUseHierarchy)
-	err = cgroup.ioutil.WriteFile(memoryHierarchyPath, enableMemoryHierarchy, rootReadOnlyPermissions)
-	if err != nil {
-		return fmt.Errorf("cgroup resource [%s]: setup cgroup: unable to set use hierarchy flag: %w", cgroup.taskARN, err)
+	if !config.CgroupV2 {
+		// enabling cgroup memory hierarchy by doing 'echo 1 > memory.use_hierarchy'
Review thread (on the memory.use_hierarchy change):

Contributor: I was looking into this memory.use_hierarchy flag; some information is in the kernel doc (see section 6, Hierarchy support). It basically enables recursive accounting and reclaiming, but I couldn't find the cgroups v2 equivalent (if any). Were we able to verify that container memory consumption gets charged to the task cgroup as well?

Contributor (author, sparrc, Feb 16, 2022): My understanding was that since cgroups v2 is by default a "unified hierarchy", this flag no longer applies. From my testing this seems to be the case, because limits set on a task slice with two containers are applied to both containers in aggregate.

Contributor (yinyic, Feb 25, 2022): I think applying this flag in v1 charges sub-cgroup memory usage to its parent (bottom-up). The allocation (top-down) is not affected with or without the flag. This is my understanding: if a task cgroup has two containers, the task has a hard limit of 100MB with each container getting 50MB, and one of the two containers attempts to allocate 200MB, then:

  • without the flag, only the container will be killed
  • with the flag, because the memory allocation got charged to the task, the task will also be OOM-killed.

Can we verify that's the behavior? And to be honest, I'm not entirely sure why we want to enable this hierarchical charging. If a non-essential container gets killed, we should still let the task run.

Contributor (author): Regarding "if a task cgroup has two containers, the task has a hard limit of 100MB with each container getting 50MB": I can test this, but I'm fairly certain both containers share the 100MB limit, so if one container is using only 20MB, the other can use up to 80MB.

Contributor (author): I tested the scenario of a task with 512MB of total memory, with one container allocating 100MB and another allocating 450MB. The prediction does seem correct. "Without the flag, only the container will be killed": on AL2022 only the container allocating 450MB is OOM-killed. I'm not sure exactly how it decides which container to kill; it might just kill whichever container breaches the memory limit first. "With the flag, because the memory allocation got charged to the task, the task will also be OOM-killed": yes, I confirmed that on AL2, when one container hits the limit, BOTH containers are killed by the OOM killer. Will investigate how we can turn on use_hierarchy on cgroups v2.

Contributor (yinyic, Feb 25, 2022): I would actually suggest that we turn off use_hierarchy on cgroups v1. If I understand it correctly, the only time one container's failure should bring down the whole task is when that container is the essential container. On the other hand, if a container is non-essential, it being OOM-killed (or terminated in any other way, for that matter) should not affect the other containers in the task.

Contributor (author): To follow up on this: use_hierarchy functionality is not completely available on cgroups v2. The memory.oom.group flag is close (see https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), but it causes the entire container to be killed, rather than just the process within the container. We will go forward with the kernel default behavior of only killing the "bulkiest" process when the task-level memory limit is reached.
+		memoryHierarchyPath := filepath.Join(cgroup.cgroupMountPath, memorySubsystem, cgroupRoot, memoryUseHierarchy)
+		err = cgroup.ioutil.WriteFile(memoryHierarchyPath, enableMemoryHierarchy, rootReadOnlyPermissions)
+		if err != nil {
+			return fmt.Errorf("cgroup resource: setup cgroup: unable to set use hierarchy flag taskARN=%s cgroupPath=%s cgroupV2=%v err=%s", cgroup.taskARN, cgroupRoot, config.CgroupV2, err)
+		}
+	}

 	return nil
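For context on the thread above, a hypothetical sketch of the memory.oom.group knob that was considered and deliberately not adopted by this PR; the mount point, permissions, and function name are assumptions:

package sketch

import (
	"os"
	"path/filepath"
)

// enableOOMGroupKill writes "1" to memory.oom.group so the kernel treats the
// cgroup as a single OOM-kill unit (all member processes die together).
// cgroupRoot would be something like "ECSTasks.slice/ECSTasks-<taskid>.slice".
func enableOOMGroupKill(cgroupRoot string) error {
	oomGroupPath := filepath.Join("/sys/fs/cgroup", cgroupRoot, "memory.oom.group")
	return os.WriteFile(oomGroupPath, []byte("1"), 0644)
}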
6 changes: 2 additions & 4 deletions agent/taskresource/cgroup/cgroup_test.go
@@ -22,7 +22,6 @@ import (
 	"time"

 	cgroup "github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control"
-	mock_cgroups "github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control/factory/mock"
 	"github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control/mock_control"
 	resourcestatus "github.com/aws/amazon-ecs-agent/agent/taskresource/status"
 	mock_ioutilwrapper "github.com/aws/amazon-ecs-agent/agent/utils/ioutilwrapper/mocks"
@@ -53,7 +52,7 @@ func TestCreateHappyPath(t *testing.T) {

 	gomock.InOrder(
 		mockControl.EXPECT().Exists(gomock.Any()).Return(false),
-		mockControl.EXPECT().Create(gomock.Any()).Return(nil, nil),
+		mockControl.EXPECT().Create(gomock.Any()).Return(nil),
 		mockIO.EXPECT().WriteFile(cgroupMemoryPath, gomock.Any(), gomock.Any()).Return(nil),
 	)
 	cgroupResource := NewCgroupResource("taskArn", mockControl, mockIO, cgroupRoot, cgroupMountPath, specs.LinuxResources{})
@@ -83,13 +82,12 @@ func TestCreateCgroupError(t *testing.T) {

 	mockControl := mock_control.NewMockControl(ctrl)
 	mockIO := mock_ioutilwrapper.NewMockIOUtil(ctrl)
-	mockCgroup := mock_cgroups.NewMockCgroup(ctrl)

 	cgroupRoot := fmt.Sprintf("/ecs/%s", taskID)

 	gomock.InOrder(
 		mockControl.EXPECT().Exists(gomock.Any()).Return(false),
-		mockControl.EXPECT().Create(gomock.Any()).Return(mockCgroup, errors.New("cgroup create error")),
+		mockControl.EXPECT().Create(gomock.Any()).Return(errors.New("cgroup create error")),
 	)

 	cgroupResource := NewCgroupResource("taskArn", mockControl, mockIO, cgroupRoot, cgroupMountPath, specs.LinuxResources{})
36 changes: 26 additions & 10 deletions agent/taskresource/cgroup/control/cgroup_controller_linux.go
@@ -18,7 +18,9 @@ package control
 import (
 	"fmt"

+	"github.com/aws/amazon-ecs-agent/agent/config"
 	"github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control/factory"
+	specs "github.com/opencontainers/runtime-spec/specs-go"

 	"github.com/cihub/seelog"
 	"github.com/containerd/cgroups"
@@ -32,6 +34,9 @@ type control struct {

 // New is used to obtain a new cgroup control object
 func New() Control {
+	if config.CgroupV2 {
+		return &controlv2{}
+	}
 	return newControl(&factory.GlobalCgroupFactory{})
 }

@@ -43,27 +48,25 @@ func newControl(cgroupFact factory.CgroupFactory) Control {
 }

 // Create creates a new cgroup based off the spec post validation
-func (c *control) Create(cgroupSpec *Spec) (cgroups.Cgroup, error) {
+func (c *control) Create(cgroupSpec *Spec) error {
 	// Validate incoming spec
 	err := validateCgroupSpec(cgroupSpec)
 	if err != nil {
-		return nil, fmt.Errorf("cgroup create: failed to validate spec: %w", err)
+		return fmt.Errorf("cgroup create: failed to validate spec: %w", err)
 	}

 	// Create cgroup
-	seelog.Infof("Creating cgroup %s", cgroupSpec.Root)
-	controller, err := c.New(cgroups.V1, cgroups.StaticPath(cgroupSpec.Root), cgroupSpec.Specs)
-
+	seelog.Debugf("Creating cgroup cgroupPath=%s", cgroupSpec.Root)
+	_, err = c.New(cgroups.V1, cgroups.StaticPath(cgroupSpec.Root), cgroupSpec.Specs)
 	if err != nil {
-		return nil, fmt.Errorf("cgroup create: unable to create controller: %w", err)
+		return fmt.Errorf("cgroup create: unable to create controller: v1: %s", err)
 	}

-	return controller, nil
+	return nil
 }

 // Remove is used to delete the cgroup
 func (c *control) Remove(cgroupPath string) error {
-	seelog.Debugf("Removing cgroup %s", cgroupPath)
+	seelog.Debugf("Removing cgroup cgroupPath=%s", cgroupPath)

 	controller, err := c.Load(cgroups.V1, cgroups.StaticPath(cgroupPath))
 	if err != nil {
@@ -81,7 +84,7 @@ func (c *control) Remove(cgroupPath string) error {

 // Exists is used to verify the existence of a cgroup
 func (c *control) Exists(cgroupPath string) bool {
-	seelog.Debugf("Checking existence of cgroup: %s", cgroupPath)
+	seelog.Debugf("Checking existence of cgroup cgroupPath=%s", cgroupPath)

 	controller, err := c.Load(cgroups.V1, cgroups.StaticPath(cgroupPath))
 	if err != nil || controller == nil {
@@ -91,6 +94,19 @@ func (c *control) Exists(cgroupPath string) bool {
 	return true
 }

+// Init is used to setup the cgroup root for ecs
+func (c *control) Init() error {
+	seelog.Debugf("Creating root ecs cgroup cgroupPath=%s", config.DefaultTaskCgroupV1Prefix)
+
+	// Build cgroup spec
+	cgroupSpec := &Spec{
+		Root:  config.DefaultTaskCgroupV1Prefix,
+		Specs: &specs.LinuxResources{},
+	}
+	err := c.Create(cgroupSpec)
+	return err
+}
+
 // validateCgroupSpec checks the cgroup spec for valid path and specifications
 func validateCgroupSpec(cgroupSpec *Spec) error {
 	if cgroupSpec == nil {
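Putting the pieces together, a hypothetical caller of the reworked Control interface now handles a single error return from Create; the wiring, task ID, and error handling below are illustrative, not part of the PR:

package main

import (
	control "github.com/aws/amazon-ecs-agent/agent/taskresource/cgroup/control"
	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// setupTaskCgroupExample shows the post-PR flow: New() picks the v2
// implementation when config.CgroupV2 is set, and Create returns only an
// error (no cgroups.Cgroup handle to discard).
func setupTaskCgroupExample() error {
	ctl := control.New()
	if err := ctl.Init(); err != nil {
		return err // root ecs cgroup could not be created
	}
	spec := &control.Spec{
		Root:  "/ecs/111mytaskid", // illustrative v1-style task cgroup path
		Specs: &specs.LinuxResources{},
	}
	if ctl.Exists(spec.Root) {
		return nil
	}
	return ctl.Create(spec)
}

func main() {
	if err := setupTaskCgroupExample(); err != nil {
		panic(err)
	}
}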