Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mesos input: Collect framework_offers and allocator metrics #5719

Merged
merged 4 commits into from
Aug 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions plugins/inputs/mesos/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ For more information, please check the [Mesos Observability Metrics](http://meso
"system",
"agents",
"frameworks",
"framework_offers",
"tasks",
"messages",
"evqueue",
"registrar",
"allocator",
]
## A list of Mesos slaves, default is []
# slaves = []
Expand Down Expand Up @@ -100,6 +102,10 @@ Mesos master metric groups
- master/slaves_connected
- master/slaves_disconnected
- master/slaves_inactive
- master/slave_unreachable_canceled
- master/slave_unreachable_completed
- master/slave_unreachable_scheduled
- master/slaves_unreachable

- frameworks
- master/frameworks_active
Expand All @@ -108,6 +114,22 @@ Mesos master metric groups
- master/frameworks_inactive
- master/outstanding_offers

- framework_offers
- master/frameworks/subscribed
- master/frameworks/calls_total
- master/frameworks/calls
- master/frameworks/events_total
- master/frameworks/events
- master/frameworks/operations_total
- master/frameworks/operations
- master/frameworks/tasks/active
- master/frameworks/tasks/terminal
- master/frameworks/offers/sent
- master/frameworks/offers/accepted
- master/frameworks/offers/declined
- master/frameworks/offers/rescinded
- master/frameworks/roles/suppressed

- tasks
- master/tasks_error
- master/tasks_failed
Expand All @@ -117,6 +139,11 @@ Mesos master metric groups
- master/tasks_running
- master/tasks_staging
- master/tasks_starting
- master/tasks_dropped
- master/tasks_gone
- master/tasks_gone_by_operator
- master/tasks_killing
- master/tasks_unreachable

- messages
- master/invalid_executor_to_framework_messages
Expand Down Expand Up @@ -155,11 +182,17 @@ Mesos master metric groups
- master/task_lost/source_master/reason_slave_removed
- master/task_lost/source_slave/reason_executor_terminated
- master/valid_executor_to_framework_messages
- master/invalid_operation_status_update_acknowledgements
- master/messages_operation_status_update_acknowledgement
- master/messages_reconcile_operations
- master/messages_suppress_offers
- master/valid_operation_status_update_acknowledgements

- evqueue
- master/event_queue_dispatches
- master/event_queue_http_requests
- master/event_queue_messages
- master/operator_event_stream_subscribers

- registrar
- registrar/state_fetch_ms
Expand All @@ -172,6 +205,45 @@ Mesos master metric groups
- registrar/state_store_ms/p99
- registrar/state_store_ms/p999
- registrar/state_store_ms/p9999
- registrar/state_store_ms/count
- registrar/log/ensemble_size
- registrar/log/recovered
- registrar/queued_operations
- registrar/registry_size_bytes

- allocator
- allocator/allocation_run_ms
- allocator/allocation_run_ms/count
- allocator/allocation_run_ms/max
- allocator/allocation_run_ms/min
- allocator/allocation_run_ms/p50
- allocator/allocation_run_ms/p90
- allocator/allocation_run_ms/p95
- allocator/allocation_run_ms/p99
- allocator/allocation_run_ms/p999
- allocator/allocation_run_ms/p9999
- allocator/allocation_runs
- allocator/allocation_run_latency_ms
- allocator/allocation_run_latency_ms/count
- allocator/allocation_run_latency_ms/max
- allocator/allocation_run_latency_ms/min
- allocator/allocation_run_latency_ms/p50
- allocator/allocation_run_latency_ms/p90
- allocator/allocation_run_latency_ms/p95
- allocator/allocation_run_latency_ms/p99
- allocator/allocation_run_latency_ms/p999
- allocator/allocation_run_latency_ms/p9999
- allocator/roles/shares/dominant
- allocator/event_queue_dispatches
- allocator/offer_filters/roles/active
- allocator/quota/roles/resources/offered_or_allocated
- allocator/quota/roles/resources/guarantee
- allocator/resources/cpus/offered_or_allocated
- allocator/resources/cpus/total
- allocator/resources/disk/offered_or_allocated
- allocator/resources/disk/total
- allocator/resources/mem/offered_or_allocated
- allocator/resources/mem/total

Mesos slave metric groups
- resources
Expand Down
54 changes: 50 additions & 4 deletions plugins/inputs/mesos/mesos.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ type Mesos struct {
}

var allMetrics = map[Role][]string{
MASTER: {"resources", "master", "system", "agents", "frameworks", "tasks", "messages", "evqueue", "registrar"},
MASTER: {"resources", "master", "system", "agents", "frameworks", "framework_offers", "tasks", "messages", "evqueue", "registrar", "allocator"},
SLAVE: {"resources", "agent", "system", "executors", "tasks", "messages"},
}

Expand All @@ -58,10 +58,12 @@ var sampleConfig = `
"system",
"agents",
"frameworks",
"framework_offers",
"tasks",
"messages",
"evqueue",
"registrar",
"allocator",
]
## A list of Mesos slaves, default is []
# slaves = []
Expand Down Expand Up @@ -305,6 +307,10 @@ func getMetrics(role Role, group string) []string {
"master/slaves_connected",
"master/slaves_disconnected",
"master/slaves_inactive",
"master/slave_unreachable_canceled",
"master/slave_unreachable_completed",
"master/slave_unreachable_scheduled",
"master/slaves_unreachable",
}

m["frameworks"] = []string{
Expand All @@ -315,6 +321,12 @@ func getMetrics(role Role, group string) []string {
"master/outstanding_offers",
}

// framework_offers and allocator metrics have unpredictable names, so they can't be listed here.
// These empty groups are included to prevent the "unknown metrics group" info log below.
// filterMetrics() filters these metrics by looking for names with the corresponding prefix.
m["framework_offers"] = []string{}
m["allocator"] = []string{}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we don't list the metrics here and remove the logic in gatherMainMetrics?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Allocator and framework metric names may include values that can't be predicted, such as framework ID and role (docs here and here). That prevents us from listing all the possible allocator and framework metrics here.


m["tasks"] = []string{
"master/tasks_error",
"master/tasks_failed",
Expand All @@ -324,6 +336,11 @@ func getMetrics(role Role, group string) []string {
"master/tasks_running",
"master/tasks_staging",
"master/tasks_starting",
"master/tasks_dropped",
"master/tasks_gone",
"master/tasks_gone_by_operator",
"master/tasks_killing",
"master/tasks_unreachable",
}

m["messages"] = []string{
Expand Down Expand Up @@ -363,12 +380,18 @@ func getMetrics(role Role, group string) []string {
"master/task_lost/source_master/reason_slave_removed",
"master/task_lost/source_slave/reason_executor_terminated",
"master/valid_executor_to_framework_messages",
"master/invalid_operation_status_update_acknowledgements",
"master/messages_operation_status_update_acknowledgement",
"master/messages_reconcile_operations",
"master/messages_suppress_offers",
"master/valid_operation_status_update_acknowledgements",
}

m["evqueue"] = []string{
"master/event_queue_dispatches",
"master/event_queue_http_requests",
"master/event_queue_messages",
"master/operator_event_stream_subscribers",
}

m["registrar"] = []string{
Expand All @@ -382,6 +405,11 @@ func getMetrics(role Role, group string) []string {
"registrar/state_store_ms/p99",
"registrar/state_store_ms/p999",
"registrar/state_store_ms/p9999",
"registrar/log/ensemble_size",
"registrar/log/recovered",
"registrar/queued_operations",
"registrar/registry_size_bytes",
"registrar/state_store_ms/count",
}
} else if role == SLAVE {
m["resources"] = []string{
Expand Down Expand Up @@ -477,9 +505,27 @@ func (m *Mesos) filterMetrics(role Role, metrics *map[string]interface{}) {
}

for _, k := range metricsDiff(role, selectedMetrics) {
for _, v := range getMetrics(role, k) {
if _, ok = (*metrics)[v]; ok {
delete((*metrics), v)
switch k {
// allocator and framework_offers metrics have unpredictable names, so we have to identify them by name prefix.
case "allocator":
for m := range *metrics {
if strings.HasPrefix(m, "allocator/") {
delete((*metrics), m)
}
}
case "framework_offers":
for m := range *metrics {
if strings.HasPrefix(m, "master/frameworks/") || strings.HasPrefix(m, "frameworks/") {
delete((*metrics), m)
}
}

// All other metrics have predictable names. We can use getMetrics() to retrieve them.
default:
for _, v := range getMetrics(role, k) {
if _, ok = (*metrics)[v]; ok {
delete((*metrics), v)
}
}
}
}
Expand Down
Loading