From ea2da9c114dff955111e7d4a41862457fb3c0f68 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Fri, 28 May 2021 19:59:08 +0100 Subject: [PATCH] split estimations by observations and proposal --- .../task-manager-troubleshooting.asciidoc | 121 ++++++++++++------ .../monitoring/capacity_estimation.test.ts | 95 +++++++------- .../server/monitoring/capacity_estimation.ts | 73 ++++++----- .../test_suites/task_manager/health_route.ts | 45 +++++-- 4 files changed, 204 insertions(+), 130 deletions(-) diff --git a/docs/user/production-considerations/task-manager-troubleshooting.asciidoc b/docs/user/production-considerations/task-manager-troubleshooting.asciidoc index 94b6e59a36a9b..a9ac7b1eb0fc6 100644 --- a/docs/user/production-considerations/task-manager-troubleshooting.asciidoc +++ b/docs/user/production-considerations/task-manager-troubleshooting.asciidoc @@ -237,33 +237,44 @@ The API returns the following: ["60s", 2], ["5m", 2], ["60m", 4], - ["3600s", 1], - ["720m", 1] + ["3600s", 1], + ["720m", 1] ], - "non_recurring": 18, - "owner_ids": 0, + "non_recurring": 18, + "owner_ids": 0, "overdue": 10, - "overdue_non_recurring": 10, + "overdue_non_recurring": 10, "estimated_schedule_density": [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 3, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0], - "capacity_requirments": { - "per_minute": 6, - "per_hour": 28, - "per_day": 2 - } + "capacity_requirments": { + "per_minute": 6, + "per_hour": 28, + "per_day": 2 + } }, "status": "OK" }, - "capacity_estimation": { + "capacity_estimation": { "timestamp": "2021-02-16T11:38:06.826Z", - "value": { - "minutes_to_drain_overdue": 1, - "min_required_kibana": 1, - "max_throughput_per_minute": 200, - "avg_recurring_required_throughput_per_minute": 7, - "avg_required_throughput_per_minute": 8 - }, + "value": { + "observed": { + "observed_kibana_instances": 1, + "max_throughput_per_minute_per_kibana": 200, + "max_throughput_per_minute": 200, + "minutes_to_drain_overdue": 
1, + "avg_recurring_required_throughput_per_minute": 28, + "avg_recurring_required_throughput_per_minute_per_kibana": 28, + "avg_required_throughput_per_minute": 28, + "avg_required_throughput_per_minute_per_kibana": 28 + }, + "proposed": { + "min_required_kibana": 1, + "avg_recurring_required_throughput_per_minute_per_kibana": 28, + "avg_required_throughput_per_minute": 28, + "avg_required_throughput_per_minute_per_kibana": 28 + } + }, "status": "OK" - } + } } } -------------------------------------------------- @@ -846,20 +857,34 @@ Evaluating the health stats above you can see the following output under `stats. [source,json] -------------------------------------------------- { - "assumed_kibana_instances": 1, # <1> - "minutes_to_drain_overdue": 1, # <2> - "min_required_kibana": 1, # <3> - "max_throughput_per_minute": 200, # <4> - "avg_recurring_required_throughput_per_minute": 7, # <5> - "avg_required_throughput_per_minute": 8 # <6> + "observed": { + "observed_kibana_instances": 1, # <1> + "minutes_to_drain_overdue": 1, # <2> + "max_throughput_per_minute_per_kibana": 200, + "max_throughput_per_minute": 200, # <3> + "avg_recurring_required_throughput_per_minute": 28, # <4> + "avg_recurring_required_throughput_per_minute_per_kibana": 28, + "avg_required_throughput_per_minute": 28, # <5> + "avg_required_throughput_per_minute_per_kibana": 28 + }, + "proposed": { + "min_required_kibana": 1, # <6> + "avg_recurring_required_throughput_per_minute_per_kibana": 28, + "avg_required_throughput_per_minute": 28, + "avg_required_throughput_per_minute_per_kibana": 28 + } } -------------------------------------------------- <1> These estimates assume that there is one {kib} instance actively executing tasks -<2> Based on past throughput the overdue tasks in the system should be executed within 1 minute -<3> One {kib} instance should be sufficient to run the current workload -<4> Assuming all {kib} instacnes in the cluster are configured the same as this instance, the maximum 
available throughput is 200 tasks per minute -<5> On average the recurring tasks in the system have historically required a throughput of 7 tasks per minute -<6> On average, regardless of whether they are recurring or otherwise, the tasks in the system have historically required a throughput of 8 tasks per minute +<2> Based on past throughput the overdue tasks in the system could be executed within 1 minute +<3> Assuming all {kib} instances in the cluster are configured the same as this instance, the maximum available throughput is 200 tasks per minute +<4> On average the recurring tasks in the system have historically required a throughput of 28 tasks per minute +<5> On average, regardless of whether they are recurring or otherwise, the tasks in the system have historically required a throughput of 28 tasks per minute +<6> One {kib} instance should be sufficient to run the current workload + +The `capacity_estimation` section is made up of two subsections: +* `observed` estimates the current capacity by observing historical runtime and workload statistics +* `proposed` estimates the baseline {kib} cluster size and the expected throughput under such a deployment strategy You can infer from these estimates that the current system is under-utilised and has enough capacity to handle many more tasks than it currently does. 
@@ -868,26 +893,38 @@ Suppose an alternate scenario, where you see the following output under `stats.c [source,json] -------------------------------------------------- { - "assumed_kibana_instances": 2, # <1> - "minutes_to_drain_overdue": 12, # <2> - "min_required_kibana": 3, # <3> - "max_throughput_per_minute": 400, # <4> - "avg_recurring_required_throughput_per_minute": 285, # <5> - "avg_required_throughput_per_minute": 410 # <6> + "observed": { + "observed_kibana_instances": 2, # <1> + "max_throughput_per_minute_per_kibana": 200, + "max_throughput_per_minute": 400, # <2> + "minutes_to_drain_overdue": 12, # <3> + "avg_recurring_required_throughput_per_minute": 354, # <4> + "avg_recurring_required_throughput_per_minute_per_kibana": 177, # <5> + "avg_required_throughput_per_minute": 434, # <6> + "avg_required_throughput_per_minute_per_kibana": 217 + }, + "proposed": { + "min_required_kibana": 3, # <7> + "avg_recurring_required_throughput_per_minute_per_kibana": 118, # <8> + "avg_required_throughput_per_minute_per_kibana": 145 # <9> + } } -------------------------------------------------- <1> These estimates assume that there are two {kib} instance actively executing tasks -<2> Based on past throughput the overdue tasks in the system should be executed within 12 minute -<3> The system estimates that at least three {kib} instances are required to run the current workload -<4> The maximum available throughput in the system currently is 400 tasks per minute -<5> On average the recurring tasks in the system have historically required a throughput of 285 tasks per minute -<6> On average the tasks in the system have historically required a throughput of 410 tasks per minute +<2> The maximum available throughput in the system currently is 400 tasks per minute +<3> Based on past throughput the overdue tasks in the system should be executed within 12 minutes +<4> On average the recurring tasks in the system have historically required a throughput of 354 tasks per minute +<5> 
On average each {kib} instance utilizes 177 tasks per minute of its capacity to execute recurring tasks +<6> On average the tasks in the system have historically required a throughput of 434 tasks per minute +<7> The system estimates that at least three {kib} instances are required to run the current workload +<8> Once a third {kib} instance is provisioned, the capacity utilized by each instance to execute recurring tasks should drop from 177 to 118 tasks per minute +<9> Taking into account historical ad-hoc task execution, we estimate the throughput required of each {kib} instance will drop from 217 tasks per minute to 145, once a third {kib} instance is provisioned Evaluating by these estimates, we can infer some interesting attributes of our system: * These estimates are produced based on the assumption that there are two {kib} instances in the cluster. This number is based on the number of {kib} instances actively executing tasks in recent minutes. At times this number might fluctuate if {kib} instances remain idle, so validating these estimates against what you know about the system is recommended. * There appear to be so many overdue tasks that it would take 12 minutes of executions to catch up with that backlog. This does not take into account tasks that might become overdue during those 12 minutes, so while this congestion might be temporary, the system could also remain consistently under provisioned and might never drain the backlog entirely. -* Evauating the recurring tasks in the workload the system requires a throughput of 285 tasks per minute on average to execute tasks on time, which is well below the estimated maximum throughput of 400 tasks per minute. Once we take into account historical throughpout though, we estimate that the required throughput at 410 tasks per minute. This suggests that, historically, over 30% of tasks have been ad-hoc non-recurring tasks, the scale of which are harder to predict than recurring tasks. 
+* Evaluating the recurring tasks in the workload the system requires a throughput of 354 tasks per minute on average to execute tasks on time, which is lower than the estimated maximum throughput of 400 tasks per minute. Once we take into account historical throughput though, we estimate the required throughput at 434 tasks per minute. This suggests that, historically, approximately 20% of tasks have been ad-hoc non-recurring tasks, the scale of which are harder to predict than recurring tasks. You can infer from these estimates that the capacity in the current system is insufficient and at least one additional {kib} instance is required in order to keep up with the workload. diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts index 9a78def9ef226..78e00c8e29e72 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts @@ -36,11 +36,11 @@ describe('estimateCapacity', () => { }, } ) - ).value + ).value.observed ).toMatchObject({ - assumed_kibana_instances: 1, - assumed_minutes_to_drain_overdue: 0, - assumed_max_throughput_per_minute: 200, + observed_kibana_instances: 1, + minutes_to_drain_overdue: 0, + max_throughput_per_minute: 200, }); }); @@ -71,11 +71,11 @@ describe('estimateCapacity', () => { }, } ) - ).value + ).value.observed ).toMatchObject({ - assumed_kibana_instances: 1, - assumed_minutes_to_drain_overdue: 0, - assumed_max_throughput_per_minute: 200, + observed_kibana_instances: 1, + minutes_to_drain_overdue: 0, + max_throughput_per_minute: 200, }); }); @@ -107,11 +107,11 @@ describe('estimateCapacity', () => { }, } ) - ).value + ).value.observed ).toMatchObject({ - assumed_kibana_instances: 1, - assumed_minutes_to_drain_overdue: 0, 
+ max_throughput_per_minute: 200, }); }); @@ -142,13 +142,13 @@ describe('estimateCapacity', () => { }, } ) - ).value + ).value.observed ).toMatchObject({ - assumed_kibana_instances: 3, - assumed_minutes_to_drain_overdue: 0, - assumed_max_throughput_per_minute: 3 * 200, // 3 kibana, 200tpm each - assumed_avg_required_throughput_per_minute: 150 + 1, // 150 every minute, plus 60 every hour - assumed_avg_required_throughput_per_minute_per_kibana: Math.ceil((150 + 1) / 3), + observed_kibana_instances: 3, + minutes_to_drain_overdue: 0, + max_throughput_per_minute: 3 * 200, // 3 kibana, 200tpm each + avg_required_throughput_per_minute: 150 + 1, // 150 every minute, plus 60 every hour + avg_required_throughput_per_minute_per_kibana: Math.ceil((150 + 1) / 3), }); }); @@ -190,15 +190,15 @@ describe('estimateCapacity', () => { }, } ) - ).value + ).value.observed ).toMatchObject({ - assumed_kibana_instances: provisionedKibanaInstances, - assumed_minutes_to_drain_overdue: 0, - assumed_max_throughput_per_minute: provisionedKibanaInstances * 200, // 2 kibana, 200tpm each - assumed_avg_required_throughput_per_minute_per_kibana: Math.ceil( + observed_kibana_instances: provisionedKibanaInstances, + minutes_to_drain_overdue: 0, + max_throughput_per_minute: provisionedKibanaInstances * 200, // 2 kibana, 200tpm each + avg_required_throughput_per_minute_per_kibana: Math.ceil( expectedAverageRequiredCapacityPerKibana ), - assumed_avg_required_throughput_per_minute: Math.ceil( + avg_required_throughput_per_minute: Math.ceil( provisionedKibanaInstances * expectedAverageRequiredCapacityPerKibana ), // same as above but for both instances }); @@ -212,7 +212,10 @@ describe('estimateCapacity', () => { const expectedAverageRequiredCapacityPerKibanaCurrently = 200 * 0.5 + recurringTasksPerMinute / provisionedKibanaInstances; const expectedAverageRequiredCapacityPerKibanaOnceThereAreEnoughServers = - 200 * 0.5 + recurringTasksPerMinute / (provisionedKibanaInstances + 1); + // the non-recurring 
task load should now be shared between 3 server instead of 2 + (200 * 0.5 * provisionedKibanaInstances) / (provisionedKibanaInstances + 1) + + // so will the recurring tasks + recurringTasksPerMinute / (provisionedKibanaInstances + 1); expect( estimateCapacity( @@ -249,27 +252,27 @@ describe('estimateCapacity', () => { ) ).value ).toMatchObject({ - assumed_kibana_instances: provisionedKibanaInstances, - assumed_minutes_to_drain_overdue: 0, - assumed_max_throughput_per_minute: provisionedKibanaInstances * 200, // 2 kibana, 200tpm each - assumed_avg_required_throughput_per_minute_per_kibana: Math.ceil( - expectedAverageRequiredCapacityPerKibanaCurrently - ), - assumed_avg_required_throughput_per_minute: Math.ceil( - provisionedKibanaInstances * expectedAverageRequiredCapacityPerKibanaCurrently - ), // same as above bt for both instances - min_required_kibana: provisionedKibanaInstances + 1, - avg_recurring_required_throughput_per_minute: Math.ceil(recurringTasksPerMinute), - avg_recurring_required_throughput_per_minute_per_kibana: Math.ceil( - recurringTasksPerMinute / (provisionedKibanaInstances + 1) - ), - avg_required_throughput_per_minute: Math.ceil( - expectedAverageRequiredCapacityPerKibanaOnceThereAreEnoughServers * - (1 + provisionedKibanaInstances) - ), - avg_required_throughput_per_minute_per_kibana: Math.ceil( - expectedAverageRequiredCapacityPerKibanaOnceThereAreEnoughServers - ), + observed: { + observed_kibana_instances: provisionedKibanaInstances, + minutes_to_drain_overdue: 0, + max_throughput_per_minute: provisionedKibanaInstances * 200, // 2 kibana, 200tpm each + avg_recurring_required_throughput_per_minute: Math.ceil(recurringTasksPerMinute), + avg_required_throughput_per_minute_per_kibana: Math.ceil( + expectedAverageRequiredCapacityPerKibanaCurrently + ), + avg_required_throughput_per_minute: Math.ceil( + provisionedKibanaInstances * expectedAverageRequiredCapacityPerKibanaCurrently + ), // same as above bt for both instances + }, + proposed: { + 
min_required_kibana: provisionedKibanaInstances + 1, + avg_recurring_required_throughput_per_minute_per_kibana: Math.ceil( + recurringTasksPerMinute / (provisionedKibanaInstances + 1) + ), + avg_required_throughput_per_minute_per_kibana: Math.ceil( + expectedAverageRequiredCapacityPerKibanaOnceThereAreEnoughServers + ), + }, }); }); diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts index bbb20b8ea3848..dc0764dbdab7a 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts @@ -10,17 +10,21 @@ import { JsonObject } from 'src/plugins/kibana_utils/common'; import { RawMonitoringStats, RawMonitoredStat, HealthStatus } from './monitoring_stats_stream'; export interface CapacityEstimationStat extends JsonObject { - assumed_kibana_instances: number; - assumed_max_throughput_per_minute: number; - assumed_minutes_to_drain_overdue: number; - min_required_kibana: number; - max_throughput_per_minute_per_kibana: number; - avg_recurring_required_throughput_per_minute: number; - avg_recurring_required_throughput_per_minute_per_kibana: number; - avg_required_throughput_per_minute: number; - avg_required_throughput_per_minute_per_kibana: number; - assumed_avg_required_throughput_per_minute: number; - assumed_avg_required_throughput_per_minute_per_kibana: number; + observed: { + observed_kibana_instances: number; + max_throughput_per_minute: number; + max_throughput_per_minute_per_kibana: number; + minutes_to_drain_overdue: number; + avg_required_throughput_per_minute: number; + avg_required_throughput_per_minute_per_kibana: number; + avg_recurring_required_throughput_per_minute: number; + avg_recurring_required_throughput_per_minute_per_kibana: number; + }; + proposed: { + min_required_kibana: number; + avg_recurring_required_throughput_per_minute_per_kibana: number; + 
avg_required_throughput_per_minute_per_kibana: number; + }; } export type CapacityEstimationParams = Omit< @@ -118,7 +122,8 @@ export function estimateCapacity( * each kibana need if following the minRequiredKibanaInstances? */ const averageRequiredThroughputPerMinutePerKibana = - averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana + + averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana * + (assumedKibanaInstances / minRequiredKibanaInstances) + averageRecurringRequiredPerMinute / minRequiredKibanaInstances; const assumedAverageRecurringRequiredThroughputPerMinutePerKibana = @@ -140,25 +145,31 @@ export function estimateCapacity( ? HealthStatus.Warning : HealthStatus.Error, timestamp: new Date().toISOString(), - value: mapValues( - { - assumed_kibana_instances: assumedKibanaInstances, - assumed_max_throughput_per_minute: assumedCapacityAvailablePerMinute, - assumed_minutes_to_drain_overdue: - overdue / (assumedKibanaInstances * averageCapacityUsedByPersistedTasksPerKibana), - min_required_kibana: minRequiredKibanaInstances, - max_throughput_per_minute_per_kibana: capacityPerMinutePerKibana, - avg_recurring_required_throughput_per_minute: averageRecurringRequiredPerMinute, - avg_recurring_required_throughput_per_minute_per_kibana: averageRecurringRequiredPerMinutePerKibana, - avg_required_throughput_per_minute: - averageRequiredThroughputPerMinutePerKibana * minRequiredKibanaInstances, - avg_required_throughput_per_minute_per_kibana: averageRequiredThroughputPerMinutePerKibana, - assumed_avg_required_throughput_per_minute: - assumedRequiredThroughputPerMinutePerKibana * assumedKibanaInstances, - assumed_avg_required_throughput_per_minute_per_kibana: assumedRequiredThroughputPerMinutePerKibana, - }, - Math.ceil - ), + value: { + observed: mapValues( + { + observed_kibana_instances: assumedKibanaInstances, + max_throughput_per_minute_per_kibana: capacityPerMinutePerKibana, + max_throughput_per_minute: assumedCapacityAvailablePerMinute, + 
minutes_to_drain_overdue: + overdue / (assumedKibanaInstances * averageCapacityUsedByPersistedTasksPerKibana), + avg_recurring_required_throughput_per_minute: averageRecurringRequiredPerMinute, + avg_recurring_required_throughput_per_minute_per_kibana: assumedAverageRecurringRequiredThroughputPerMinutePerKibana, + avg_required_throughput_per_minute: + assumedRequiredThroughputPerMinutePerKibana * assumedKibanaInstances, + avg_required_throughput_per_minute_per_kibana: assumedRequiredThroughputPerMinutePerKibana, + }, + Math.ceil + ), + proposed: mapValues( + { + min_required_kibana: minRequiredKibanaInstances, + avg_recurring_required_throughput_per_minute_per_kibana: averageRecurringRequiredPerMinutePerKibana, + avg_required_throughput_per_minute_per_kibana: averageRequiredThroughputPerMinutePerKibana, + }, + Math.ceil + ), + }, }; } diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 0ed5292da12c4..bccee7f6b6010 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -60,11 +60,22 @@ interface MonitoringStats { capacity_estimation: { timestamp: string; value: { - minutes_to_drain_overdue: number; - max_throughput_per_minute: number; - min_required_kibana: number; - avg_required_throughput_per_minute: number; - avg_recurring_required_throughput_per_minute: number; + observed: { + observed_kibana_instances: number; + max_throughput_per_minute: number; + max_throughput_per_minute_per_kibana: number; + minutes_to_drain_overdue: number; + avg_required_throughput_per_minute: number; + avg_required_throughput_per_minute_per_kibana: number; + avg_recurring_required_throughput_per_minute: number; + avg_recurring_required_throughput_per_minute_per_kibana: number; + }; + proposed: { + min_required_kibana: number; + 
avg_recurring_required_throughput_per_minute_per_kibana: number; + avg_required_throughput_per_minute: number; + avg_required_throughput_per_minute_per_kibana: number; + }; }; }; }; @@ -174,16 +185,28 @@ export default function ({ getService }: FtrProviderContext) { it('should return a breakdown of idleTasks in the task manager workload', async () => { const { - capacity_estimation: { value: capacityEstimation }, + capacity_estimation: { + value: { observed, proposed }, + }, } = (await getHealth()).stats; - expect(typeof capacityEstimation.minutes_to_drain_overdue).to.eql('number'); - expect(typeof capacityEstimation.max_throughput_per_minute).to.eql('number'); - expect(typeof capacityEstimation.min_required_kibana).to.eql('number'); - expect(typeof capacityEstimation.avg_required_throughput_per_minute).to.eql('number'); - expect(typeof capacityEstimation.avg_recurring_required_throughput_per_minute).to.eql( + expect(typeof observed.observed_kibana_instances).to.eql('number'); + expect(typeof observed.max_throughput_per_minute).to.eql('number'); + expect(typeof observed.max_throughput_per_minute_per_kibana).to.eql('number'); + expect(typeof observed.minutes_to_drain_overdue).to.eql('number'); + expect(typeof observed.avg_required_throughput_per_minute).to.eql('number'); + expect(typeof observed.avg_required_throughput_per_minute_per_kibana).to.eql('number'); + expect(typeof observed.avg_recurring_required_throughput_per_minute).to.eql('number'); + expect(typeof observed.avg_recurring_required_throughput_per_minute_per_kibana).to.eql( + 'number' + ); + + expect(typeof proposed.min_required_kibana).to.eql('number'); + expect(typeof proposed.avg_recurring_required_throughput_per_minute_per_kibana).to.eql( 'number' ); + expect(typeof proposed.avg_required_throughput_per_minute).to.eql('number'); + expect(typeof proposed.avg_required_throughput_per_minute_per_kibana).to.eql('number'); }); it('should return an estimation of task manager capacity', async () => {