[Ingest Management] Agent expose metrics #22793

michalpristas · 2020-11-30T15:21:51Z

What does this PR do?

Using the system package focused on the agent process, we are collecting CPU, disk, and memory metrics, which are sent to ds.elastic_agent-elastic-agent

At first I was playing with exposing an endpoint and using the beat module to collect some information about the agent, but I let it go, as most of the information collected using this module is not relevant except for goroutines, and it makes the code bloated with unnecessary setups providing empty values for fields which are noncollectable/unreportable from the agent's point of view.

Why is it important?

#22394

Checklist

My code follows the style guidelines of this project
I have commented my code, particularly in hard-to-understand areas
I have made corresponding changes to the documentation
I have made corresponding change to the default configuration files
I have added tests that prove my fix is effective or that my feature works
I have added an entry in CHANGELOG.next.asciidoc or CHANGELOG-developer.next.asciidoc.

Example of final doc

linux

{
  "_index": ".ds-metrics-elastic_agent.elastic_agent-default-000001",
  "_id": "1d6qYHYBIHKyMD4EYWSe",
  "_version": 1,
  "_score": null,
  "_source": {
    "@timestamp": "2020-12-14T09:52:26.506Z",
    "ecs": {
      "version": "1.7.0"
    },
    "metricset": {
      "period": 10000,
      "name": "json"
    },
    "service": {
      "address": "http://unix/stats",
      "type": "http"
    },
    "elastic_agent": {
      "snapshot": false,
      "version": "8.0.0",
      "id": "76069e50-3df1-11eb-a870-73a635b35320",
      "process": "elastic-agent"
    },
    "agent": {
      "version": "8.0.0",
      "ephemeral_id": "2198b42f-be51-45f9-acfe-0c3e47021d64",
      "id": "5ce18284-c42b-46a4-9149-5df70be787a6",
      "name": "vagrant",
      "type": "metricbeat"
    },
    "event": {
      "dataset": "elastic_agent.elastic_agent",
      "module": "http",
      "duration": 9794045
    },
    "host": {
      "architecture": "x86_64",
      "os": {
        "platform": "ubuntu",
        "version": "16.04.1 LTS (Xenial Xerus)",
        "family": "debian",
        "name": "Ubuntu",
        "kernel": "4.4.0-31-generic",
        "codename": "xenial"
      },
      "name": "vagrant",
      "id": "c0cc2a7efa902a719ada8ab6584b6bcb",
      "containerized": false,
      "ip": [
        "172.17.0.1"
      ],
      "mac": [
        "08:00:27:08:27:32"
      ],
      "hostname": "vagrant"
    },
    "data_stream": {
      "dataset": "elastic_agent.elastic_agent",
      "namespace": "default",
      "type": "metrics"
    },
    "system": {
      "process": {
        "cpu": {
          "system": {
            "ticks": 1190,
            "time": {
              "ms": 1196
            }
          },
          "total": {
            "time": {
              "ms": 4464
            },
            "value": 4450,
            "ticks": 4450
          },
          "user": {
            "ticks": 3260,
            "time": {
              "ms": 3268
            }
          }
        },
        "memory": {
          "size": 73482496
        },
        "fd": {
          "limit": {
            "hard": 4096,
            "soft": 1024
          },
          "open": 21
        },
        "cgroup": {
          "cpu": {
            "id": "elastic-agent.service",
            "stats": {
              "throttled": {
                "ns": 0,
                "periods": 0
              },
              "periods": 0
            },
            "cfs": {
              "quota": {
                "us": 0
              },
              "period": {
                "us": 100000
              }
            }
          },
          "cpuacct": {
            "id": "elastic-agent.service",
            "total": {
              "ns": 13885517853
            }
          },
          "memory": {
            "id": "elastic-agent.service",
            "mem": {
              "usage": {
                "bytes": 428773376
              },
              "limit": {
                "bytes": 9223372036854772000
              }
            }
          }
        }
      }
    }
  },
  "fields": {
    "@timestamp": [
      "2020-12-14T09:52:26.506Z"
    ]
  },
  "sort": [
    1607939546506
  ]
}

mac

{
	"_index": ".ds-metrics-elastic_agent.elastic_agent-default-000001",
	"_id": "2QxlPHYBjGFDnaF_EkU-",
	"_version": 1,
	"_score": null,
	"_source": {
		"@timestamp": "2020-12-07T08:50:24.348Z",
		"event": {
			"dataset": "elastic_agent.elastic_agent",
			"module": "http",
			"duration": 3040126
		},
		"metricset": {
			"name": "json",
			"period": 10000
		},
		"system": {
			"process": {
				"cpu": {
					"system": {
						"ticks": 1745,
						"time": {
							"ms": 1745
						}
					},
					"total": {
						"ticks": 7291,
						"time": {
							"ms": 7291
						},
						"value": 7291
					},
					"user": {
						"time": {
							"ms": 5546
						},
						"ticks": 5546
					}
				},
				"memory": {
					"size": 74531072
				}
			}
		},
		"host": {
			"mac": [
				"ac:de:48:ac:de:48"
			],
			"name": "MacBook-Pro-2.local",
			"hostname": "MacBook-Pro-2.local",
			"architecture": "x86_64",
			"os": {
				"name": "Mac OS X",
				"kernel": "18.7.0",
				"build": "18G6032",
				"platform": "darwin",
				"version": "10.14.6",
				"family": "darwin"
			},
			"id": "FC609F24-07E1-54EA-8E33-56F9D5A7A97E",
			"ip": [
				"127.0.0.2"
			]
		},
		"agent": {
			"ephemeral_id": "0cf156d9-4398-4c29-a52d-596ec7a93f5f",
			"id": "e09c86a1-f5dd-4fe8-898c-70de832e2a9e",
			"name": "MacBook-Pro-2.local",
			"type": "metricbeat",
			"version": "8.0.0"
		},
		"service": {
			"address": "http://unix/stats",
			"type": "http"
		},
		"data_stream": {
			"dataset": "elastic_agent.elastic_agent",
			"namespace": "default",
			"type": "metrics"
		},
		"elastic_agent": {
			"snapshot": false,
			"version": "8.0.0",
			"id": "02e6478a-72b9-4a5e-bd63-0f6be2ef4dba",
			"process": "elastic-agent"
		},
		"ecs": {
			"version": "1.6.0"
		}
	},
	"fields": {
		"@timestamp": [
			"2020-12-07T08:50:24.348Z"
		]
	},
	"sort": [
		1607331024348
	]
}

[Ingest Manager] Log level reloadable from fleet (elastic#22690)

elasticmachine · 2020-11-30T15:21:54Z

Pinging @elastic/ingest-management (Team:Ingest Management)

elasticmachine · 2020-11-30T15:44:48Z

💚 Build Succeeded

the below badges are clickable and redirect to their specific view in the CI or DOCS

Expand to view the summary

Build stats

Build Cause: Pull request #22793 updated
Start Time: 2020-12-10T19:01:36.506+0000
Duration: 54 min 43 sec

Test stats 🧪

Test	Results
Failed	0
Passed	17385
Skipped	1379
Total	18764

Steps errors

Expand to view the steps failures

`Terraform Apply on x-pack/metricbeat/module/aws`

Took 0 min 15 sec . View more details on here

`Terraform Apply on x-pack/metricbeat/module/aws`

Took 0 min 17 sec . View more details on here

💚 Flaky test report

Tests succeeded.

Expand to view the summary

Test stats 🧪

Test	Results
Failed	0
Passed	17385
Skipped	1379
Total	18764

blakerouse · 2020-11-30T18:16:58Z

x-pack/elastic-agent/pkg/agent/operation/monitoring.go

+					"target": "data_stream",
+					"fields": map[string]interface{}{
+						"type":      "metrics",
+						"dataset":   fmt.Sprintf("elastic_agent.%s", agentName),


Do we do this for logs? Is it elastic_agent.elastic-agent? I think it's just elastic_agent.

The - in elastic-agent is a problem as well. We should not have that, because we had to change endpoint-security to endpoint_security otherwise it breaks how the namespace is used of ending with -default.

…etrics

ph · 2020-12-01T20:34:06Z

Interesting I have never considered doing it that way, maybe I was always focused on the libbeat thing, I think we are losing file descriptor and the number of goroutines?

@blakerouse @ruflin Is that an appropriate way?

blakerouse · 2020-12-01T22:21:12Z

x-pack/elastic-agent/pkg/agent/operation/monitoring.go

+					"target": "data_stream",
+					"fields": map[string]interface{}{
+						"type":      "metrics",
+						"dataset":   fmt.Sprintf("elastic_agent.%s", fixedAgentName),


Still question if it should have the ending .elastic_agent? Why not just metrics-elastic_agent-default? Or does that not match the logs?

was thinking about following convention, i like shorter one better as well

blakerouse · 2020-12-01T22:22:54Z

@michalpristas what type of metrics are we getting with this vs the libbeat way? I assume this is more system based like overall CPU/memory of the process?

Seems like a smart way to target the Elastic Agent at an overall system usage level.

michalpristas · 2020-12-02T06:48:29Z

@blakerouse The libbeat way was collecting a lot of unusable metrics. What is exposed using the /stats and /state endpoints is what the beat module of Metricbeat uses.

for state it's

"state": {
            "management": {
                "enabled": false
            },
            "module": {
                "count": 3
            },
            "output": {
                "name": "elasticsearch"
            },
            "queue": {
                "name": "mem"
            }

As we are always management enabled (if we consider Fleet a management), there's nothing else in state we are interested in; we don't have an output just yet and no queue.

for stats:

 "libbeat": {
                "output": {
                    "events": {
                        "acked": 0,
                        "active": 0,
                        "batches": 0,
                        "dropped": 0,
                        "duplicates": 0,
                        "failed": 0,
                        "toomany": 0,
                        "total": 0
                    },
                    "read": {
                        "bytes": 0,
                        "errors": 0
                    },
                    "type": "elasticsearch",
                    "write": {
                        "bytes": 0,
                        "errors": 0
                    }
                }
            },
            "runtime": {
                "goroutines": 39
            },
            "uptime": {
                "ms": 12019
            }

Again, we don't have a direct output event we can monitor; only runtime.goroutines and uptime are interesting.

With system process we collect much more valuable information: https://www.elastic.co/guide/en/beats/metricbeat/current/metricbeat-metricset-system-process.html

ph · 2020-12-02T15:26:10Z

@michalpristas Can you add a JSON document that is generated and we can have @ravikesarwani review it?

ruflin · 2020-12-02T15:35:46Z

Are there Elastic Agent specific metrics we want to add? For example how many processes are running, how many config changes and similar? If we add these, where would these be added?

michalpristas · 2020-12-02T17:57:36Z

@ruflin i suppose we could create custom module for metricbeat which will monitor whatever agent exposes on predefined endpoint?

…etrics

ruflin · 2020-12-03T12:22:55Z

Maybe we have this module already and we just use http?

…etrics

nchaulet · 2020-12-10T18:56:08Z

x-pack/elastic-agent/pkg/agent/operation/monitoring.go

+							"from": "http.agent.beat.handles",
+							"to":   "system.process.fd",
+						},
+						// I should be able to see fd usage. Am I keep too many files open?


a small typo in the comment?

yes, copycat struck here

not copycat, my file just got partially saved, I hate when this happens

nchaulet · 2020-12-10T19:08:06Z

@michalpristas Am I missing something to have memory per process? Looks like the memory is sent only for the elastic-agent events

michalpristas · 2020-12-10T19:13:40Z

@nchaulet try rebuilding all beats (from x-pack directory running rm -rf */build then package agent)

nchaulet · 2020-12-10T19:50:51Z

it's working :)

ruflin · 2020-12-14T07:33:25Z

@michalpristas Could you put an example of the final version of the doc into the PR description?

michalpristas · 2020-12-14T07:42:56Z

@ruflin done

ruflin · 2020-12-14T07:55:21Z

LGTM. Nit: An empty http: {} is still around.

michalpristas · 2020-12-14T07:58:14Z

in reality its not, i was just replacing metrics section with updated one here

simitt · 2020-12-14T08:29:03Z

The final doc does not show any system.process.cgroup.cpu.* data. @michalpristas any idea why they are missing?

michalpristas · 2020-12-14T09:30:33Z

will take a look need to spinup kibana

michalpristas · 2020-12-14T09:54:58Z

@simitt I copied the entire document for the linux client now instead of modifying local changes manually
cpu seems to be there, no http: {}

simitt · 2020-12-14T10:23:12Z

Thanks @michalpristas! The metrics LGTM.

michalpristas · 2020-12-14T12:49:33Z

@ph @ruflin @blakerouse can i get approval, seems metrics are fine, nicolas is already building dashboards on top of this agent

ruflin

Approving the doc structure, I'll leave to @blakerouse to check the code.

blakerouse

Code looks good.

* [Ingest Manager] Log level reloadable from fleet (elastic#22690) [Ingest Manager] Log level reloadable from fleet (elastic#22690) * aa * create drop * updated drop * process contains everything * drop start time * undo exposed endpoint * sanitize dataset name * ups * agent expose http * collect all metrics from beats * colelct all from beats * golint * cleaner docs * updated structure * cgroup * long live file saving issues (cherry picked from commit 49c8d87)

…23105) * [Ingest Management] Agent expose metrics (#22793) * [Ingest Manager] Log level reloadable from fleet (#22690) [Ingest Manager] Log level reloadable from fleet (#22690) * aa * create drop * updated drop * process contains everything * drop start time * undo exposed endpoint * sanitize dataset name * ups * agent expose http * collect all metrics from beats * colelct all from beats * golint * cleaner docs * updated structure * cgroup * long live file saving issues (cherry picked from commit 49c8d87) * Add changelog. Co-authored-by: Michal Pristas <[email protected]>

michalpristas added 8 commits November 26, 2020 09:10

[Ingest Manager] Log level reloadable from fleet (elastic#22690)

7fabe99

[Ingest Manager] Log level reloadable from fleet (elastic#22690)

aa

111722d

create drop

1602106

updated drop

f84f24c

process contains everything

7a29a0a

drop start time

e254ed3

conflicts

24264e9

undo exposed endpoint

4fcd9bd

michalpristas added enhancement Team:Ingest Management labels Nov 30, 2020

michalpristas self-assigned this Nov 30, 2020

botelastic bot added needs_team Indicates that the issue/PR needs a Team:* label and removed needs_team Indicates that the issue/PR needs a Team:* label labels Nov 30, 2020

blakerouse reviewed Nov 30, 2020

View reviewed changes

michalpristas added 3 commits December 1, 2020 09:23

Merge branch 'master' of github.com:elastic/beats into agent-expose-m…

c6ab1ed

…etrics

sanitize dataset name

d7b98b1

ups

725dd52

michalpristas changed the title ~~Agent expose metrics~~ [Ingest Management] Agent expose metrics Dec 1, 2020

blakerouse reviewed Dec 1, 2020

View reviewed changes

Merge branch 'master' of github.com:elastic/beats into agent-expose-m…

e680cf6

…etrics

agent expose http

d27a9b0

michalpristas added 5 commits December 9, 2020 14:17

cleaner docs

23ec64a

Merge branch 'master' of github.com:elastic/beats into agent-expose-m…

a341dca

…etrics

updated structure

ed6da5f

Merge branch 'master' of github.com:elastic/beats into agent-expose-m…

c837bc2

…etrics

cgroup

2d7db46

michalpristas mentioned this pull request Dec 10, 2020

[Elastic Agent] Collect Elastic Agent metrics and send to Elasticsearch #22394

Closed

7 tasks

nchaulet reviewed Dec 10, 2020

View reviewed changes

long live file saving issues

f785fda

ruflin approved these changes Dec 14, 2020

View reviewed changes

blakerouse approved these changes Dec 14, 2020

View reviewed changes

blakerouse merged commit 49c8d87 into elastic:master Dec 14, 2020

blakerouse mentioned this pull request Dec 14, 2020

Cherry-pick #22793 to 7.x: [Ingest Management] Agent expose metrics #23105

Merged

6 tasks

blakerouse added the v7.11.0 label Dec 14, 2020

blakerouse mentioned this pull request Dec 14, 2020

[Elastic Agent] Add changelog entry for metrics collection #23106

Merged

1 task

nchaulet mentioned this pull request Dec 14, 2020

[Fleet] Add Elastic agent integration elastic/integrations#462

Merged

2 tasks

simitt mentioned this pull request Jan 27, 2021

[elastic-agent] fix: cpu cgroup values #23714

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Ingest Management] Agent expose metrics #22793

[Ingest Management] Agent expose metrics #22793

michalpristas commented Nov 30, 2020 •

edited

Loading

elasticmachine commented Nov 30, 2020

elasticmachine commented Nov 30, 2020 •

edited by jenkins-beats-ci bot

Loading

Build stats

Test stats 🧪

`Terraform Apply on x-pack/metricbeat/module/aws`

`Terraform Apply on x-pack/metricbeat/module/aws`

Test stats 🧪

blakerouse Nov 30, 2020

ph commented Dec 1, 2020

blakerouse Dec 1, 2020

michalpristas Dec 2, 2020

blakerouse commented Dec 1, 2020

michalpristas commented Dec 2, 2020 •

edited

Loading

ph commented Dec 2, 2020

ruflin commented Dec 2, 2020

michalpristas commented Dec 2, 2020

ruflin commented Dec 3, 2020

nchaulet Dec 10, 2020

michalpristas Dec 10, 2020

michalpristas Dec 10, 2020

nchaulet commented Dec 10, 2020

michalpristas commented Dec 10, 2020

nchaulet commented Dec 10, 2020

ruflin commented Dec 14, 2020

michalpristas commented Dec 14, 2020

ruflin commented Dec 14, 2020

michalpristas commented Dec 14, 2020

simitt commented Dec 14, 2020

michalpristas commented Dec 14, 2020

michalpristas commented Dec 14, 2020 •

edited

Loading

simitt commented Dec 14, 2020

michalpristas commented Dec 14, 2020

ruflin left a comment

blakerouse left a comment

[Ingest Management] Agent expose metrics #22793

[Ingest Management] Agent expose metrics #22793

Conversation

michalpristas commented Nov 30, 2020 • edited Loading

What does this PR do?

Why is it important?

Checklist

Example of final doc

linux

mac

elasticmachine commented Nov 30, 2020

elasticmachine commented Nov 30, 2020 • edited by jenkins-beats-ci bot Loading

💚 Build Succeeded

Build stats

Test stats 🧪

Steps errors

Terraform Apply on x-pack/metricbeat/module/aws

Terraform Apply on x-pack/metricbeat/module/aws

💚 Flaky test report

Test stats 🧪

blakerouse Nov 30, 2020

Choose a reason for hiding this comment

ph commented Dec 1, 2020

blakerouse Dec 1, 2020

Choose a reason for hiding this comment

michalpristas Dec 2, 2020

Choose a reason for hiding this comment

blakerouse commented Dec 1, 2020

michalpristas commented Dec 2, 2020 • edited Loading

ph commented Dec 2, 2020

ruflin commented Dec 2, 2020

michalpristas commented Dec 2, 2020

ruflin commented Dec 3, 2020

nchaulet Dec 10, 2020

Choose a reason for hiding this comment

michalpristas Dec 10, 2020

Choose a reason for hiding this comment

michalpristas Dec 10, 2020

Choose a reason for hiding this comment

nchaulet commented Dec 10, 2020

michalpristas commented Dec 10, 2020

nchaulet commented Dec 10, 2020

ruflin commented Dec 14, 2020

michalpristas commented Dec 14, 2020

ruflin commented Dec 14, 2020

michalpristas commented Dec 14, 2020

simitt commented Dec 14, 2020

michalpristas commented Dec 14, 2020

michalpristas commented Dec 14, 2020 • edited Loading

simitt commented Dec 14, 2020

michalpristas commented Dec 14, 2020

ruflin left a comment

Choose a reason for hiding this comment

blakerouse left a comment

Choose a reason for hiding this comment

michalpristas commented Nov 30, 2020 •

edited

Loading

elasticmachine commented Nov 30, 2020 •

edited by jenkins-beats-ci bot

Loading

`Terraform Apply on x-pack/metricbeat/module/aws`

`Terraform Apply on x-pack/metricbeat/module/aws`

michalpristas commented Dec 2, 2020 •

edited

Loading

michalpristas commented Dec 14, 2020 •

edited

Loading