[Response Ops][Task Manager] Expose SLI metrics in HTTP API (elastic#…

…162178) Towards elastic#160334 ## Summary Exposes a new HTTP API at `/api/task_manager/metrics` that collects SLI metrics for task manager. The following metrics are exposed: - count of task claim successes & count of task claim tries - this is a counter metric that keeps track of overall task claim success, not task claim success of individual background task workers - count of task run successes & count of task runs - this is a counter metric that keeps track of overall task run successes, as well as successes grouped by task type. Alerting and action task types are rolled up into an `alerting` and an `actions` group to allow us to calculate SLIs across all alerting rules and all actions - task claim duration in milliseconds - this is a histogram counter metric that is bucketed into 100 ms buckets. These counter metrics are incremented until a reset event is received, in which case the counter is reset back to 0. This allows the collection mechanism (in this case Elastic Agent) to determine the interval at which these metrics are collected as well as to collect the rate of change for these SLI metrics without having to perform complicated Elasticsearch aggregation math. In addition, the counters are reset every 30 seconds (this is configurable) to avoid providing the metrics collector with stale data in case of a collector outage. Flaky test runner: https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/2813 --------- Co-authored-by: Kibana Machine <[email protected]>
mistic · Aug 10, 2023 · 582d97d · 582d97d
1 parent f422484
commit 582d97d
Show file tree

Hide file tree

Showing 39 changed files with 2,469 additions and 86 deletions.
diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts
@@ -23,6 +23,7 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
+        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,
@@ -81,6 +82,7 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
+        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,
@@ -137,6 +139,7 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
+        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,

diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts
@@ -20,6 +20,8 @@ export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
 export const DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW = 50;
 export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;
 
+export const DEFAULT_METRICS_RESET_INTERVAL = 30 * 1000; // 30 seconds
+
 // At the default poll interval of 3sec, this averages over the last 15sec.
 export const DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW = 5;
 
@@ -52,53 +54,63 @@ const eventLoopDelaySchema = schema.object({
 });
 
 const requeueInvalidTasksConfig = schema.object({
-  enabled: schema.boolean({ defaultValue: false }),
   delay: schema.number({ defaultValue: 3000, min: 0 }),
+  enabled: schema.boolean({ defaultValue: false }),
   max_attempts: schema.number({ defaultValue: 100, min: 1, max: 500 }),
 });
 
 export const configSchema = schema.object(
   {
+    allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
+    ephemeral_tasks: schema.object({
+      enabled: schema.boolean({ defaultValue: false }),
+      /* How many requests can Task Manager buffer before it rejects new requests. */
+      request_capacity: schema.number({
+        // a nice round contrived number, feel free to change as we learn how it behaves
+        defaultValue: 10,
+        min: 1,
+        max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
+      }),
+    }),
+    event_loop_delay: eventLoopDelaySchema,
     /* The maximum number of times a task will be attempted before being abandoned as failed */
     max_attempts: schema.number({
       defaultValue: 3,
       min: 1,
     }),
-    /* How often, in milliseconds, the task manager will look for more work. */
-    poll_interval: schema.number({
-      defaultValue: DEFAULT_POLL_INTERVAL,
-      min: 100,
-    }),
-    /* How many requests can Task Manager buffer before it rejects new requests. */
-    request_capacity: schema.number({
-      // a nice round contrived number, feel free to change as we learn how it behaves
-      defaultValue: 1000,
-      min: 1,
-    }),
     /* The maximum number of tasks that this Kibana instance will run simultaneously. */
     max_workers: schema.number({
       defaultValue: DEFAULT_MAX_WORKERS,
       // disable the task manager rather than trying to specify it with 0 workers
       min: 1,
     }),
-    /* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
-    version_conflict_threshold: schema.number({
-      defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
-      min: 50,
-      max: 100,
-    }),
-    /* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
-    monitored_stats_required_freshness: schema.number({
-      defaultValue: (config?: unknown) =>
-        ((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
-      min: 100,
+    /* The interval at which monotonically increasing metrics counters will reset */
+    metrics_reset_interval: schema.number({
+      defaultValue: DEFAULT_METRICS_RESET_INTERVAL,
+      min: 10 * 1000, // minimum 10 seconds
     }),
     /* The rate at which we refresh monitored stats that require aggregation queries against ES. */
     monitored_aggregated_stats_refresh_rate: schema.number({
       defaultValue: DEFAULT_MONITORING_REFRESH_RATE,
       /* don't run monitored stat aggregations any faster than once every 5 seconds */
       min: 5000,
     }),
+    monitored_stats_health_verbose_log: schema.object({
+      enabled: schema.boolean({ defaultValue: false }),
+      level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
+        defaultValue: 'debug',
+      }),
+      /* The amount of seconds we allow a task to delay before printing a warning server log */
+      warn_delayed_task_start_in_seconds: schema.number({
+        defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
+      }),
+    }),
+    /* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
+    monitored_stats_required_freshness: schema.number({
+      defaultValue: (config?: unknown) =>
+        ((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
+      min: 100,
+    }),
     /* The size of the running average window for monitored stats. */
     monitored_stats_running_average_window: schema.number({
       defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW,
@@ -107,44 +119,39 @@ export const configSchema = schema.object(
     }),
     /* Task Execution result warn & error thresholds. */
     monitored_task_execution_thresholds: schema.object({
-      default: taskExecutionFailureThresholdSchema,
       custom: schema.recordOf(schema.string(), taskExecutionFailureThresholdSchema, {
         defaultValue: {},
       }),
+      default: taskExecutionFailureThresholdSchema,
     }),
-    monitored_stats_health_verbose_log: schema.object({
-      enabled: schema.boolean({ defaultValue: false }),
-      level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
-        defaultValue: 'debug',
-      }),
-      /* The amount of seconds we allow a task to delay before printing a warning server log */
-      warn_delayed_task_start_in_seconds: schema.number({
-        defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
-      }),
-    }),
-    ephemeral_tasks: schema.object({
-      enabled: schema.boolean({ defaultValue: false }),
-      /* How many requests can Task Manager buffer before it rejects new requests. */
-      request_capacity: schema.number({
-        // a nice round contrived number, feel free to change as we learn how it behaves
-        defaultValue: 10,
-        min: 1,
-        max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
-      }),
+    /* How often, in milliseconds, the task manager will look for more work. */
+    poll_interval: schema.number({
+      defaultValue: DEFAULT_POLL_INTERVAL,
+      min: 100,
     }),
-    event_loop_delay: eventLoopDelaySchema,
-    worker_utilization_running_average_window: schema.number({
-      defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
-      max: 100,
+    /* How many requests can Task Manager buffer before it rejects new requests. */
+    request_capacity: schema.number({
+      // a nice round contrived number, feel free to change as we learn how it behaves
+      defaultValue: 1000,
       min: 1,
     }),
+    requeue_invalid_tasks: requeueInvalidTasksConfig,
     /* These are not designed to be used by most users. Please use caution when changing these */
     unsafe: schema.object({
-      exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
       authenticate_background_task_utilization: schema.boolean({ defaultValue: true }),
+      exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
+    }),
+    /* The threshold percentage for workers experiencing version conflicts for shifting the polling interval. */
+    version_conflict_threshold: schema.number({
+      defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
+      min: 50,
+      max: 100,
+    }),
+    worker_utilization_running_average_window: schema.number({
+      defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
+      max: 100,
+      min: 1,
     }),
-    requeue_invalid_tasks: requeueInvalidTasksConfig,
-    allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
   },
   {
     validate: (config) => {

diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts
@@ -84,6 +84,7 @@ describe('EphemeralTaskLifecycle', () => {
           delay: 3000,
           max_attempts: 20,
         },
+        metrics_reset_interval: 3000,
         ...config,
       },
       elasticsearchAndSOAvailability$,

diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts
@@ -79,6 +79,7 @@ describe('managed configuration', () => {
         delay: 3000,
         max_attempts: 20,
       },
+      metrics_reset_interval: 3000,
     });
     logger = context.logger.get('taskManager');
 

diff --git a/...nitoring/runtime_statistics_aggregator.ts → ...rver/lib/runtime_statistics_aggregator.ts b/...nitoring/runtime_statistics_aggregator.ts → ...rver/lib/runtime_statistics_aggregator.ts