Skip to content

Commit

Permalink
[Response Ops][Task Manager] Expose SLI metrics in HTTP API (elastic#…
Browse files Browse the repository at this point in the history
…162178)

Towards elastic#160334

## Summary

Exposes a new HTTP API at `/api/task_manager/metrics` that collects SLI
metrics for task manager. The following metrics are exposed:
- count of task claim successes & count of task claim tries - this is a
counter metric that keeps track over overall task claim success, not
task claim success of individual background task workers
- count of task run success & count of task runs - this is a counter
metric that keeps track of overall task run successes, as well as
successes grouped by task type. Alerting and action task types are
rolled up into an `alerting` and an `actions` group to allow us to
calculate SLIs across all alerting rules and all actions
- task claim duration in milliseconds - this is a histogram counter
metric that is bucketed into 100 ms buckets

These counter metrics are incremented until a reset event is received,
in which case the counter is reset back to 0. This allows the collection
mechanism (in this case Elastic Agent) to determine the interval at
which these metrics are collected as well as to collect the rate of
change for these SLI metrics without having to perform complicated
Elasticsearch aggregation math. In addition, the counters are reset
every 30 seconds (this is configurable) to avoid providing the metrics
collector with stale data in case of a collector outage.


Flaky test runner:
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/2813

---------

Co-authored-by: Kibana Machine <[email protected]>
  • Loading branch information
ymao1 and kibanamachine authored Aug 10, 2023
1 parent f422484 commit 582d97d
Show file tree
Hide file tree
Showing 39 changed files with 2,469 additions and 86 deletions.
3 changes: 3 additions & 0 deletions x-pack/plugins/task_manager/server/config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ describe('config validation', () => {
},
"max_attempts": 3,
"max_workers": 10,
"metrics_reset_interval": 30000,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
Expand Down Expand Up @@ -81,6 +82,7 @@ describe('config validation', () => {
},
"max_attempts": 3,
"max_workers": 10,
"metrics_reset_interval": 30000,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
Expand Down Expand Up @@ -137,6 +139,7 @@ describe('config validation', () => {
},
"max_attempts": 3,
"max_workers": 10,
"metrics_reset_interval": 30000,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
Expand Down
107 changes: 57 additions & 50 deletions x-pack/plugins/task_manager/server/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
export const DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW = 50;
export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;

export const DEFAULT_METRICS_RESET_INTERVAL = 30 * 1000; // 30 seconds

// At the default poll interval of 3sec, this averages over the last 15sec.
export const DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW = 5;

Expand Down Expand Up @@ -52,53 +54,63 @@ const eventLoopDelaySchema = schema.object({
});

const requeueInvalidTasksConfig = schema.object({
enabled: schema.boolean({ defaultValue: false }),
delay: schema.number({ defaultValue: 3000, min: 0 }),
enabled: schema.boolean({ defaultValue: false }),
max_attempts: schema.number({ defaultValue: 100, min: 1, max: 500 }),
});

export const configSchema = schema.object(
{
allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
ephemeral_tasks: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 10,
min: 1,
max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
}),
}),
event_loop_delay: eventLoopDelaySchema,
/* The maximum number of times a task will be attempted before being abandoned as failed */
max_attempts: schema.number({
defaultValue: 3,
min: 1,
}),
/* How often, in milliseconds, the task manager will look for more work. */
poll_interval: schema.number({
defaultValue: DEFAULT_POLL_INTERVAL,
min: 100,
}),
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 1000,
min: 1,
}),
/* The maximum number of tasks that this Kibana instance will run simultaneously. */
max_workers: schema.number({
defaultValue: DEFAULT_MAX_WORKERS,
// disable the task manager rather than trying to specify it with 0 workers
min: 1,
}),
/* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
version_conflict_threshold: schema.number({
defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
min: 50,
max: 100,
}),
/* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
monitored_stats_required_freshness: schema.number({
defaultValue: (config?: unknown) =>
((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
min: 100,
/* The interval at which monotonically increasing metrics counters will reset */
metrics_reset_interval: schema.number({
defaultValue: DEFAULT_METRICS_RESET_INTERVAL,
min: 10 * 1000, // minimum 10 seconds
}),
/* The rate at which we refresh monitored stats that require aggregation queries against ES. */
monitored_aggregated_stats_refresh_rate: schema.number({
defaultValue: DEFAULT_MONITORING_REFRESH_RATE,
/* don't run monitored stat aggregations any faster than once every 5 seconds */
min: 5000,
}),
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
defaultValue: 'debug',
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
/* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
monitored_stats_required_freshness: schema.number({
defaultValue: (config?: unknown) =>
((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
min: 100,
}),
/* The size of the running average window for monitored stats. */
monitored_stats_running_average_window: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW,
Expand All @@ -107,44 +119,39 @@ export const configSchema = schema.object(
}),
/* Task Execution result warn & error thresholds. */
monitored_task_execution_thresholds: schema.object({
default: taskExecutionFailureThresholdSchema,
custom: schema.recordOf(schema.string(), taskExecutionFailureThresholdSchema, {
defaultValue: {},
}),
default: taskExecutionFailureThresholdSchema,
}),
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
defaultValue: 'debug',
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
ephemeral_tasks: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 10,
min: 1,
max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
}),
/* How often, in milliseconds, the task manager will look for more work. */
poll_interval: schema.number({
defaultValue: DEFAULT_POLL_INTERVAL,
min: 100,
}),
event_loop_delay: eventLoopDelaySchema,
worker_utilization_running_average_window: schema.number({
defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
max: 100,
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 1000,
min: 1,
}),
requeue_invalid_tasks: requeueInvalidTasksConfig,
/* These are not designed to be used by most users. Please use caution when changing these */
unsafe: schema.object({
exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
authenticate_background_task_utilization: schema.boolean({ defaultValue: true }),
exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
}),
/* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
version_conflict_threshold: schema.number({
defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
min: 50,
max: 100,
}),
worker_utilization_running_average_window: schema.number({
defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
max: 100,
min: 1,
}),
requeue_invalid_tasks: requeueInvalidTasksConfig,
allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
},
{
validate: (config) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ describe('EphemeralTaskLifecycle', () => {
delay: 3000,
max_attempts: 20,
},
metrics_reset_interval: 3000,
...config,
},
elasticsearchAndSOAvailability$,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ describe('managed configuration', () => {
delay: 3000,
max_attempts: 20,
},
metrics_reset_interval: 3000,
});
logger = context.logger.get('taskManager');

Expand Down
Loading

0 comments on commit 582d97d

Please sign in to comment.