Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge metrics branch to dev #2072

Merged
merged 29 commits into from
Jun 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
958fe51
tcs: Update tcs model for network and storage stats
sharanyad Apr 25, 2019
64ade5f
add overflow field for int64
sharanyad Apr 30, 2019
94cc93d
Merge branch 'dev' into new-metrics
sharanyad Apr 30, 2019
d09abb5
Merge pull request #2011 from sharanyad/new-metrics
sharanyad Apr 30, 2019
0252995
adds StorageReadBytes and StorageWriteBytes to stats collection
fierlion Apr 30, 2019
f3827a6
stats: add network stats to container metrics
sharanyad Apr 30, 2019
2b02a4a
remove network stats addition to task metrics
sharanyad May 3, 2019
c970da1
add StorageStatsSet and windows stats collection
fierlion May 3, 2019
d1beea4
add nil check for iobytes
fierlion May 7, 2019
ef394ae
add container name to metrics
sharanyad May 9, 2019
131040f
stats: add container name to stats container
sharanyad May 9, 2019
23fc49f
Merge branch 'dev' into new-metrics
sharanyad May 9, 2019
744c1ea
Merge pull request #2033 from sharanyad/new-metrics
sharanyad May 10, 2019
df47b1b
stats: add network and storage stats to container metrics
sharanyad May 15, 2019
f30c90f
stats:add container's name to container metrics
sharanyad May 16, 2019
b2fe45e
stats: empty network stats for none,host and awsvpc mode
sharanyad May 17, 2019
f07a9eb
network stats integration tests
sharanyad May 21, 2019
b7b8b57
Merge branch 'dev' into network-disk-metrics
sharanyad May 22, 2019
39088a3
Merge pull request #2045 from sharanyad/new-metrics
sharanyad May 22, 2019
fd1492f
updated with storage stats check
fierlion May 23, 2019
538c163
add storage-stats task for functional testing
fierlion May 24, 2019
24f4809
Merge branch 'dev' into network-disk-metrics
sharanyad Jun 3, 2019
d0ff62a
Merge pull request #2062 from sharanyad/network-disk-metrics
sharanyad Jun 4, 2019
a88a90d
Merge branch 'dev' into network-disk-metrics
sharanyad Jun 5, 2019
6562140
Merge pull request #2067 from sharanyad/new-metrics
sharanyad Jun 5, 2019
add8792
update read/write storage-stats functional task
fierlion Jun 5, 2019
ecc9233
stats:remove unwanted debug log
sharanyad Jun 5, 2019
04c4ee0
Merge remote-tracking branch 'fierlion/updateReadWrite' into network-…
fierlion Jun 5, 2019
71d2a09
Merge branch 'network-disk-metrics' into dev
sharanyad Jun 5, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,13 @@ test-in-docker:
# Privileged needed for docker-in-docker so integ tests pass
docker run --net=none -v "$(PWD):/go/src/github.com/aws/amazon-ecs-agent" --privileged "amazon/amazon-ecs-agent-test:make"

run-functional-tests: testnnp test-registry ecr-execution-role-image telemetry-test-image
run-functional-tests: testnnp test-registry ecr-execution-role-image telemetry-test-image storage-stats-test-image
. ./scripts/shared_env && go test -tags functional -timeout=60m -v ./agent/functional_tests/...

.PHONY: build-image-for-ecr ecr-execution-role-image-for-upload upload-images replicate-images

build-image-for-ecr: netkitten volumes-test squid awscli image-cleanup-test-images fluentd taskmetadata-validator \
testnnp container-health-check-image telemetry-test-image ecr-execution-role-image-for-upload
testnnp container-health-check-image telemetry-test-image storage-stats-test-image ecr-execution-role-image-for-upload

ecr-execution-role-image-for-upload:
$(MAKE) -C misc/ecr-execution-role-upload $(MFLAGS)
Expand Down Expand Up @@ -325,7 +325,8 @@ namespace-tests:

# TODO, replace this with a build on dockerhub or a mechanism for the
# functional tests themselves to build this
.PHONY: squid awscli fluentd gremlin agent-introspection-validator taskmetadata-validator v3-task-endpoint-validator container-metadata-file-validator elastic-inference-validator image-cleanup-test-images ecr-execution-role-image container-health-check-image telemetry-test-image
.PHONY: squid awscli fluentd gremlin agent-introspection-validator taskmetadata-validator v3-task-endpoint-validator container-metadata-file-validator elastic-inference-validator image-cleanup-test-images ecr-execution-role-image container-health-check-image telemetry-test-image storage-stats-test-image

squid:
$(MAKE) -C misc/squid $(MFLAGS)

Expand Down Expand Up @@ -368,6 +369,9 @@ ecr-execution-role-image:
telemetry-test-image:
$(MAKE) -C misc/telemetry $(MFLAGS)

storage-stats-test-image:
$(MAKE) -C misc/storage-stats $(MFLAGS)

container-health-check-image:
$(MAKE) -C misc/container-health $(MFLAGS)

Expand Down Expand Up @@ -442,6 +446,7 @@ clean:
-$(MAKE) -C misc/elastic-inference-validator $(MFLAGS) clean
-$(MAKE) -C misc/container-health $(MFLAGS) clean
-$(MAKE) -C misc/telemetry $(MFLAGS) clean
-$(MAKE) -C misc/storage-stats $(MFLAGS) clean
-$(MAKE) -C misc/appmesh-plugin-validator $(MFLAGS) clean
-$(MAKE) -C misc/eni-trunking-validator $(MFLAGS) clean
-rm -f .get-deps-stamp
Expand Down
29 changes: 25 additions & 4 deletions agent/stats/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ func eventStream(name string) *eventstream.EventStream {

// createGremlin creates the gremlin container using the docker client.
// It is used only in the test code.
func createGremlin(client *sdkClient.Client) (*dockercontainer.ContainerCreateCreatedBody, error) {
func createGremlin(client *sdkClient.Client, netMode string) (*dockercontainer.ContainerCreateCreatedBody, error) {
containerGremlin, err := client.ContainerCreate(context.TODO(),
&dockercontainer.Config{
Image: testImageName,
},
&dockercontainer.HostConfig{},
&dockercontainer.HostConfig{
NetworkMode: dockercontainer.NetworkMode(netMode),
},
&network.NetworkingConfig{},
"")

Expand Down Expand Up @@ -148,12 +150,21 @@ func validateContainerMetrics(containerMetrics []*ecstcs.ContainerMetric, expect
return fmt.Errorf("Mismatch in number of ContainerStatsSet elements. Expected: %d, Got: %d", expected, len(containerMetrics))
}
for _, containerMetric := range containerMetrics {
if *containerMetric.ContainerName == "" {
return fmt.Errorf("ContainerName is empty")
}
if containerMetric.CpuStatsSet == nil {
return fmt.Errorf("CPUStatsSet is nil")
}
if containerMetric.MemoryStatsSet == nil {
return fmt.Errorf("MemoryStatsSet is nil")
}
if containerMetric.NetworkStatsSet == nil {
return fmt.Errorf("NetworkStatsSet is nil")
}
if containerMetric.StorageStatsSet == nil {
return fmt.Errorf("StorageStatsSet is nil")
}
}
return nil
}
Expand Down Expand Up @@ -247,9 +258,19 @@ func validateEmptyTaskHealthMetrics(t *testing.T, engine *DockerStatsEngine) {
}

func createFakeContainerStats() []*ContainerStats {
netStats := &NetworkStats{
RxBytes: 796,
RxDropped: 6,
RxErrors: 0,
RxPackets: 10,
TxBytes: 8192,
TxDropped: 5,
TxErrors: 0,
TxPackets: 60,
}
return []*ContainerStats{
{22400432, 1839104, parseNanoTime("2015-02-12T21:22:05.131117533Z")},
{116499979, 3649536, parseNanoTime("2015-02-12T21:22:05.232291187Z")},
{22400432, 1839104, uint64(100), uint64(200), netStats, parseNanoTime("2015-02-12T21:22:05.131117533Z")},
{116499979, 3649536, uint64(300), uint64(400), netStats, parseNanoTime("2015-02-12T21:22:05.232291187Z")},
}
}

Expand Down
12 changes: 9 additions & 3 deletions agent/stats/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,23 @@ const (
ContainerStatsBufferLength = 120
)

func newStatsContainer(dockerID string, client dockerapi.DockerClient, resolver resolver.ContainerMetadataResolver) *StatsContainer {
func newStatsContainer(dockerID string, client dockerapi.DockerClient, resolver resolver.ContainerMetadataResolver) (*StatsContainer, error) {
dockerContainer, err := resolver.ResolveContainer(dockerID)
if err != nil {
return nil, err
}
ctx, cancel := context.WithCancel(context.Background())
return &StatsContainer{
containerMetadata: &ContainerMetadata{
DockerID: dockerID,
DockerID: dockerID,
Name: dockerContainer.Container.Name,
NetworkMode: dockerContainer.Container.GetNetworkMode(),
},
ctx: ctx,
cancel: cancel,
client: client,
resolver: resolver,
}
}, nil
}

func (container *StatsContainer) StartStatsCollection() {
Expand Down
36 changes: 32 additions & 4 deletions agent/stats/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ import (
const (
containerChangeHandler = "DockerStatsEngineDockerEventsHandler"
queueResetThreshold = 2 * dockerclient.StatsInactivityTimeout
hostNetworkMode = "host"
noneNetworkMode = "none"
)

var (
Expand Down Expand Up @@ -246,8 +248,12 @@ func (engine *DockerStatsEngine) addContainerUnsafe(dockerID string) (*StatsCont
return nil, errors.Errorf("stats add container: task is terminal, ignoring container: %s, task: %s", dockerID, task.Arn)
}

statsContainer, err := newStatsContainer(dockerID, engine.client, engine.resolver)
if err != nil {
return nil, errors.Wrapf(err, "could not map docker container ID to container, ignoring container: %s", dockerID)
}

seelog.Debugf("Adding container to stats watch list, id: %s, task: %s", dockerID, task.Arn)
statsContainer := newStatsContainer(dockerID, engine.client, engine.resolver)
engine.tasksToDefinitions[task.Arn] = &taskDefinition{family: task.Family, version: task.Version}

watchStatsContainer := false
Expand Down Expand Up @@ -594,18 +600,40 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
continue
}

// Get memory stats set.
memoryStatsSet, err := container.statsQueue.GetMemoryStatsSet()
if err != nil {
seelog.Warnf("Error getting memory stats, err: %v, container: %v", err, dockerID)
continue
}

containerMetrics = append(containerMetrics, &ecstcs.ContainerMetric{
containerMetric := &ecstcs.ContainerMetric{
ContainerName: &container.containerMetadata.Name,
CpuStatsSet: cpuStatsSet,
MemoryStatsSet: memoryStatsSet,
})
}

task, err := engine.resolver.ResolveTask(dockerID)
if err != nil {
seelog.Warnf("Task not found for container ID: %s", dockerID)
} else {
// send network stats for default/bridge/nat network modes
if task.ENI == nil && container.containerMetadata.NetworkMode != hostNetworkMode && container.containerMetadata.NetworkMode != noneNetworkMode {
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
if err != nil {
// we log the error and still continue to publish cpu, memory stats
seelog.Warnf("Error getting network stats: %v, container: %v", err, dockerID)
}
containerMetric.NetworkStatsSet = networkStatsSet
}
}

storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
if err != nil {
seelog.Warnf("Error getting storage stats, err: %v, container: %v", err, dockerID)
}
containerMetric.StorageStatsSet = storageStatsSet

containerMetrics = append(containerMetrics, containerMetric)
}

return containerMetrics, nil
Expand Down
167 changes: 164 additions & 3 deletions agent/stats/engine_integ_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func TestStatsEngineWithExistingContainersWithoutHealth(t *testing.T) {
timeout := defaultDockerTimeoutSeconds

// Create a container to get the container id.
container, err := createGremlin(client)
container, err := createGremlin(client, "default")
require.NoError(t, err, "creating container failed")
defer client.ContainerRemove(ctx, container.ID, types.ContainerRemoveOptions{Force: true})

Expand Down Expand Up @@ -128,7 +128,7 @@ func TestStatsEngineWithNewContainersWithoutHealth(t *testing.T) {
// Assign ContainerStop timeout to addressable variable
timeout := defaultDockerTimeoutSeconds

container, err := createGremlin(client)
container, err := createGremlin(client, "default")
require.NoError(t, err, "creating container failed")
defer client.ContainerRemove(ctx, container.ID, types.ContainerRemoveOptions{Force: true})

Expand Down Expand Up @@ -345,7 +345,7 @@ func TestStatsEngineWithNewContainersWithPolling(t *testing.T) {
cfg.PollMetrics = true
cfg.PollingMetricsWaitDuration = 1 * time.Second
// Create a new docker client with new config
dockerClientForNewContainersWithPolling, _ := dockerapi.NewDockerGoClient(sdkClientFactory, &cfg, ctx)
dockerClientForNewContainersWithPolling, _ := dockerapi.NewDockerGoClient(sdkClientFactory, &cfg, ctx)
// Create a new docker stats engine
engine := NewDockerStatsEngine(&cfg, dockerClientForNewContainersWithPolling, eventStream("TestStatsEngineWithNewContainers"))
defer engine.removeAll()
Expand Down Expand Up @@ -574,3 +574,164 @@ func TestStatsEngineWithDockerTaskEngineMissingRemoveEvent(t *testing.T) {
validateIdleContainerMetrics(t, statsEngine)
validateEmptyTaskHealthMetrics(t, statsEngine)
}

func TestStatsEngineWithNetworkStatsDefaultMode(t *testing.T) {
testNetworkModeStats(t, "default", false)
}

func testNetworkModeStats(t *testing.T, networkMode string, statsEmpty bool) {
// Create a new docker stats engine
engine := NewDockerStatsEngine(&cfg, dockerClient, eventStream("TestStatsEngineWithNetworkStats"))
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()

// Assign ContainerStop timeout to addressable variable
timeout := defaultDockerTimeoutSeconds

// Create a container to get the container id.
container, err := createGremlin(client, networkMode)
require.NoError(t, err, "creating container failed")
defer client.ContainerRemove(ctx, container.ID, types.ContainerRemoveOptions{Force: true})

engine.cluster = defaultCluster
engine.containerInstanceArn = defaultContainerInstance

err = client.ContainerStart(ctx, container.ID, types.ContainerStartOptions{})
require.NoError(t, err, "starting container failed")
defer client.ContainerStop(ctx, container.ID, &timeout)

containerChangeEventStream := eventStream("TestStatsEngineWithNetworkStats")
taskEngine := ecsengine.NewTaskEngine(&config.Config{}, nil, nil, containerChangeEventStream,
nil, dockerstate.NewTaskEngineState(), nil, nil)
testTask := createRunningTask()

// Populate Tasks and Container map in the engine.
dockerTaskEngine := taskEngine.(*ecsengine.DockerTaskEngine)
dockerTaskEngine.State().AddTask(testTask)
dockerTaskEngine.State().AddContainer(
&apicontainer.DockerContainer{
DockerID: container.ID,
DockerName: "gremlin",
Container: testTask.Containers[0],
},
testTask)

// Inspect the container and populate the container's network mode
// This is done as part of Task Engine
// https://github.com/aws/amazon-ecs-agent/blob/d2456beb048d36bfe18159ad7f35ca6b78bb9ee9/agent/engine/docker_task_engine.go#L364
dockerContainer, err := client.ContainerInspect(ctx, container.ID)
require.NoError(t, err, "inspecting container failed")
netMode := string(dockerContainer.HostConfig.NetworkMode)
testTask.Containers[0].SetNetworkMode(netMode)

// Simulate container start prior to listener initialization.
time.Sleep(checkPointSleep)
err = engine.MustInit(ctx, taskEngine, defaultCluster, defaultContainerInstance)
require.NoError(t, err, "initializing stats engine failed")
defer engine.containerChangeEventStream.Unsubscribe(containerChangeHandler)

// Wait for the stats collection go routine to start.
time.Sleep(checkPointSleep)
_, taskMetrics, err := engine.GetInstanceMetrics()
assert.NoError(t, err, "getting instance metrics failed")
taskMetric := taskMetrics[0]
for _, containerMetric := range taskMetric.ContainerMetrics {
if statsEmpty {
assert.Nil(t, containerMetric.NetworkStatsSet, "network stats should be empty for %s network mode", networkMode)
} else {
assert.NotNil(t, containerMetric.NetworkStatsSet, "network stats should be non-empty for %s network mode", networkMode)
}
}

err = client.ContainerStop(ctx, container.ID, &timeout)
require.NoError(t, err, "stopping container failed")

err = engine.containerChangeEventStream.WriteToEventStream(dockerapi.DockerContainerChangeEvent{
Status: apicontainerstatus.ContainerStopped,
DockerContainerMetadata: dockerapi.DockerContainerMetadata{
DockerID: container.ID,
},
})
assert.NoError(t, err, "failed to write to container change event stream")
time.Sleep(waitForCleanupSleep)

// Should not contain any metrics after cleanup.
validateIdleContainerMetrics(t, engine)
validateEmptyTaskHealthMetrics(t, engine)
}

func TestStorageStats(t *testing.T) {
// Create a new docker stats engine
engine := NewDockerStatsEngine(&cfg, dockerClient, eventStream("TestStatsEngineWithStorageStats"))
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()

// Assign ContainerStop timeout to addressable variable
timeout := defaultDockerTimeoutSeconds

// Create a container to get the container id.
container, err := createGremlin(client, "default")
require.NoError(t, err, "creating container failed")
defer client.ContainerRemove(ctx, container.ID, types.ContainerRemoveOptions{Force: true})

engine.cluster = defaultCluster
engine.containerInstanceArn = defaultContainerInstance

err = client.ContainerStart(ctx, container.ID, types.ContainerStartOptions{})
require.NoError(t, err, "starting container failed")
defer client.ContainerStop(ctx, container.ID, &timeout)

containerChangeEventStream := eventStream("TestStatsEngineWithStorageStats")
taskEngine := ecsengine.NewTaskEngine(&config.Config{}, nil, nil, containerChangeEventStream,
nil, dockerstate.NewTaskEngineState(), nil, nil)
testTask := createRunningTask()

// Populate Tasks and Container map in the engine.
dockerTaskEngine := taskEngine.(*ecsengine.DockerTaskEngine)
dockerTaskEngine.State().AddTask(testTask)
dockerTaskEngine.State().AddContainer(
&apicontainer.DockerContainer{
DockerID: container.ID,
DockerName: "gremlin",
Container: testTask.Containers[0],
},
testTask)

// Inspect the container and populate the container's network mode
dockerContainer, err := client.ContainerInspect(ctx, container.ID)
require.NoError(t, err, "inspecting container failed")
// Using default network mode
netMode := string(dockerContainer.HostConfig.NetworkMode)
testTask.Containers[0].SetNetworkMode(netMode)

// Simulate container start prior to listener initialization.
time.Sleep(checkPointSleep)
err = engine.MustInit(ctx, taskEngine, defaultCluster, defaultContainerInstance)
require.NoError(t, err, "initializing stats engine failed")
defer engine.containerChangeEventStream.Unsubscribe(containerChangeHandler)

// Wait for the stats collection go routine to start.
time.Sleep(checkPointSleep)
_, taskMetrics, err := engine.GetInstanceMetrics()
assert.NoError(t, err, "getting instance metrics failed")
taskMetric := taskMetrics[0]
for _, containerMetric := range taskMetric.ContainerMetrics {
assert.NotNil(t, containerMetric.StorageStatsSet, "storage stats should be non-empty")
}

err = client.ContainerStop(ctx, container.ID, &timeout)
require.NoError(t, err, "stopping container failed")

err = engine.containerChangeEventStream.WriteToEventStream(dockerapi.DockerContainerChangeEvent{
Status: apicontainerstatus.ContainerStopped,
DockerContainerMetadata: dockerapi.DockerContainerMetadata{
DockerID: container.ID,
},
})
assert.NoError(t, err, "failed to write to container change event stream")
time.Sleep(waitForCleanupSleep)

// Should not contain any metrics after cleanup.
validateIdleContainerMetrics(t, engine)
validateEmptyTaskHealthMetrics(t, engine)
}
Loading