From f11486638a0e9de2277043f8e61e026473693d7c Mon Sep 17 00:00:00 2001 From: Patrick Mueller Date: Mon, 23 Aug 2021 16:28:54 -0400 Subject: [PATCH 1/6] resolves https://github.com/elastic/kibana/issues/109095 --- .../task_manager/server/lib/log_health_metrics.ts | 2 +- x-pack/plugins/task_manager/server/plugin.ts | 8 ++++++++ x-pack/plugins/task_manager/server/routes/health.ts | 11 +++++++---- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts index e8511b1e8c71..d283666fc8bc 100644 --- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts +++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts @@ -80,7 +80,7 @@ export function logHealthMetrics( // This is legacy support - we used to always show this logger.debug(message); if (logLevel !== LogLevel.Debug && lastLogLevel === LogLevel.Debug) { - logger.warn( + logger.debug( `Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging` ); } diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 11746e2da284..35dec38b43df 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -95,6 +95,14 @@ export class TaskManagerPlugin this.config! ); + core.status.derivedStatus$.subscribe((status) => + this.logger.debug(`status core.status.derivedStatus now set to ${status.level}`) + ); + serviceStatus$.subscribe((status) => + this.logger.debug(`status serviceStatus now set to ${status.level}`) + ); + + // here is where the system status is updated core.status.set( combineLatest([core.status.derivedStatus$, serviceStatus$]).pipe( map(([derivedStatus, serviceStatus]) => diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index fe58ee3490af..30986ab56f96 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -15,7 +15,12 @@ import { import { Observable, Subject } from 'rxjs'; import { tap, map } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; -import { Logger, ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server'; +import { + Logger, + ServiceStatus, + ServiceStatusLevels, + ServiceStatusLevel, +} from '../../../../../src/core/server'; import { MonitoringStats, summarizeMonitoringStats, @@ -118,9 +123,7 @@ export function withServiceStatus( const level = monitoredHealth.status === HealthStatus.OK ? ServiceStatusLevels.available - : monitoredHealth.status === HealthStatus.Warning - ? 
ServiceStatusLevels.degraded - : ServiceStatusLevels.unavailable; + : ServiceStatusLevels.degraded; return [ monitoredHealth, { From 9bf04b9bb83b2d0d63008945678a300b385d6dea Mon Sep 17 00:00:00 2001 From: Patrick Mueller Date: Tue, 24 Aug 2021 00:45:59 -0400 Subject: [PATCH 2/6] add logging --- .../server/lib/calculate_health_status.ts | 35 ++++++++----- .../server/lib/log_health_metrics.test.ts | 15 +++--- .../server/lib/log_health_metrics.ts | 2 +- .../monitoring/capacity_estimation.test.ts | 20 +++++++ .../server/monitoring/capacity_estimation.ts | 47 ++++++++++++++--- .../monitoring/monitoring_stats_stream.ts | 5 +- .../monitoring/task_run_statistics.test.ts | 16 +++--- .../server/monitoring/task_run_statistics.ts | 34 +++++++++--- .../task_manager/server/routes/health.test.ts | 52 +++++++++++-------- .../task_manager/server/routes/health.ts | 11 ++-- 10 files changed, 167 insertions(+), 70 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts b/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts index d0e99690066d..306ab8a50e31 100644 --- a/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts +++ b/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts @@ -9,10 +9,12 @@ import { isString } from 'lodash'; import { JsonValue } from '@kbn/utility-types'; import { HealthStatus, RawMonitoringStats } from '../monitoring'; import { TaskManagerConfig } from '../config'; +import { Logger } from '../../../../../src/core/server'; export function calculateHealthStatus( summarizedStats: RawMonitoringStats, - config: TaskManagerConfig + config: TaskManagerConfig, + logger: Logger ): HealthStatus { const now = Date.now(); @@ -23,18 +25,25 @@ export function calculateHealthStatus( // if "cold" health stats are any more stale than the configured refresh (+ a buffer), consider the system unhealthy const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5; - /** - * If the monitored stats aren't fresh, return a red status - */ - const healthStatus = - hasStatus(summarizedStats.stats, HealthStatus.Error) || - hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness) || - hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness) - ? HealthStatus.Error - : hasStatus(summarizedStats.stats, HealthStatus.Warning) - ? 
HealthStatus.Warning - : HealthStatus.OK; - return healthStatus; + if (hasStatus(summarizedStats.stats, HealthStatus.Error)) { + return HealthStatus.Error; + } + + if (hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness)) { + logger.debug('setting HealthStatus.Error because of expired hot timestamps'); + return HealthStatus.Error; + } + + if (hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness)) { + logger.debug('setting HealthStatus.Error because of expired cold timestamps'); + return HealthStatus.Error; + } + + if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) { + return HealthStatus.Warning; + } + + return HealthStatus.OK; } function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean { diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts index b8e3e78925df..28f9e02c4d4d 100644 --- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts +++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts @@ -50,12 +50,15 @@ describe('logHealthMetrics', () => { (calculateHealthStatus as jest.Mock).mockImplementation(() => HealthStatus.Error); logHealthMetrics(health, logger, config); - expect((logger as jest.Mocked).warn.mock.calls[0][0] as string).toBe( - `Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging` - ); - expect((logger as jest.Mocked).warn.mock.calls[1][0] as string).toBe( - `Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging` - ); + const debugCalls = (logger as jest.Mocked).debug.mock.calls; + const performanceMessage = `Detected potential performance issue with Task Manager. 
Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`; + const lastStatsMessage = /^Latest Monitored Stats: \{.*\}$/; + expect(debugCalls[0][0] as string).toMatch(lastStatsMessage); + expect(debugCalls[1][0] as string).toMatch(lastStatsMessage); + expect(debugCalls[2][0] as string).toBe(performanceMessage); + expect(debugCalls[3][0] as string).toMatch(lastStatsMessage); + expect(debugCalls[4][0] as string).toMatch(lastStatsMessage); + expect(debugCalls[5][0] as string).toBe(performanceMessage); }); it('should not log a warning message to enable verbose logging when the status goes from Warning to OK', () => { diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts index d283666fc8bc..1114b2861d3f 100644 --- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts +++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts @@ -36,7 +36,7 @@ export function logHealthMetrics( capacity_estimation: undefined, }, }; - const statusWithoutCapacity = calculateHealthStatus(healthWithoutCapacity, config); + const statusWithoutCapacity = calculateHealthStatus(healthWithoutCapacity, config, logger); if (statusWithoutCapacity === HealthStatus.Warning) { logLevel = LogLevel.Warn; } else if (statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats)) { diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts index 5e2b075415a1..d6ae8024b92f 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts @@ -7,11 +7,19 @@ import { CapacityEstimationParams, estimateCapacity } from './capacity_estimation'; import { HealthStatus, RawMonitoringStats } from './monitoring_stats_stream'; +import { mockLogger } from '../test_utils'; describe('estimateCapacity', () => { + const logger = mockLogger(); + + beforeAll(() => { + jest.resetAllMocks(); + }); + test('estimates the max throughput per minute based on the workload and the assumed kibana instances', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -67,6 +75,7 @@ describe('estimateCapacity', () => { test('reduces the available capacity per kibana when average task duration exceeds the poll interval', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -124,6 +133,7 @@ describe('estimateCapacity', () => { test('estimates the max throughput per minute when duration by persistence is empty', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -160,6 +170,7 @@ describe('estimateCapacity', () => { test('estimates the max throughput per minute based on the workload and the assumed kibana instances when there are tasks that repeat each hour or day', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -215,6 +226,7 @@ describe('estimateCapacity', () => { test('estimates the max throughput available when there are no active Kibana', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -271,6 +283,7 @@ describe('estimateCapacity', () => { test('estimates the max throughput available to handle the workload when 
there are multiple active kibana instances', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -332,6 +345,7 @@ describe('estimateCapacity', () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -412,6 +426,7 @@ describe('estimateCapacity', () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -493,6 +508,7 @@ describe('estimateCapacity', () => { test('marks estimated capacity as OK state when workload and load suggest capacity is sufficient', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -557,6 +573,7 @@ describe('estimateCapacity', () => { test('marks estimated capacity as Warning state when capacity is insufficient for recent spikes of non-recurring workload, but sufficient for the recurring workload', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -618,6 +635,7 @@ describe('estimateCapacity', () => { test('marks estimated capacity as Error state when workload and load suggest capacity is insufficient', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -679,6 +697,7 @@ describe('estimateCapacity', () => { test('recommmends a 20% increase in kibana when a spike in non-recurring tasks forces recurring task capacity to zero', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { @@ -754,6 +773,7 @@ describe('estimateCapacity', () => { test('recommmends a 20% increase in kibana when a spike in non-recurring tasks in a system with insufficient capacity even for recurring tasks', async () => { expect( estimateCapacity( + logger, mockStats( { max_workers: 10, poll_interval: 3000 }, { diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts index 9cc223f63b19..49c593d77ace 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts @@ -12,6 +12,7 @@ import { RawMonitoringStats, RawMonitoredStat, HealthStatus } from './monitoring import { AveragedStat } from './task_run_calcultors'; import { TaskPersistenceTypes } from './task_run_statistics'; import { asErr, asOk, map, Result } from '../lib/result_type'; +import { Logger } from '../../../../../src/core/server'; export interface CapacityEstimationStat extends JsonObject { observed: { @@ -44,6 +45,7 @@ function isCapacityEstimationParams( } export function estimateCapacity( + logger: Logger, capacityStats: CapacityEstimationParams ): RawMonitoredStat { const workload = capacityStats.workload.value; @@ -183,13 +185,14 @@ export function estimateCapacity( const assumedRequiredThroughputPerMinutePerKibana = averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana + averageRecurringRequiredPerMinute / assumedKibanaInstances; + + const status = getHealthStatus(logger, { + assumedRequiredThroughputPerMinutePerKibana, + assumedAverageRecurringRequiredThroughputPerMinutePerKibana, + capacityPerMinutePerKibana, + }); return { - status: - assumedRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana - ? HealthStatus.OK - : assumedAverageRecurringRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana - ? 
HealthStatus.Warning - : HealthStatus.Error, + status, timestamp: new Date().toISOString(), value: { observed: mapValues( @@ -220,13 +223,43 @@ export function estimateCapacity( }; } +interface GetHealthStatusParams { + assumedRequiredThroughputPerMinutePerKibana: number; + assumedAverageRecurringRequiredThroughputPerMinutePerKibana: number; + capacityPerMinutePerKibana: number; +} + +function getHealthStatus(logger: Logger, params: GetHealthStatusParams): HealthStatus { + const { + assumedRequiredThroughputPerMinutePerKibana, + assumedAverageRecurringRequiredThroughputPerMinutePerKibana, + capacityPerMinutePerKibana, + } = params; + if (assumedRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) { + return HealthStatus.OK; + } + + if (assumedAverageRecurringRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) { + logger.debug( + `setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})` + ); + return HealthStatus.Warning; + } + + logger.debug( + `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})` + ); + return HealthStatus.Error; +} + export function withCapacityEstimate( + logger: Logger, monitoredStats: RawMonitoringStats['stats'] ): RawMonitoringStats['stats'] { if (isCapacityEstimationParams(monitoredStats)) { return { ...monitoredStats, - capacity_estimation: estimateCapacity(monitoredStats), + capacity_estimation: estimateCapacity(logger, monitoredStats), }; } return monitoredStats; diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts index fdddfc41e590..08badf8fe1c9 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -136,6 +136,7 @@ export function createMonitoringStatsStream( } export function summarizeMonitoringStats( + logger: Logger, { // eslint-disable-next-line @typescript-eslint/naming-convention last_update, @@ -143,7 +144,7 @@ export function summarizeMonitoringStats( }: MonitoringStats, config: TaskManagerConfig ): RawMonitoringStats { - const summarizedStats = withCapacityEstimate({ + const summarizedStats = withCapacityEstimate(logger, { ...(configuration ? { configuration: { @@ -156,7 +157,7 @@ export function summarizeMonitoringStats( ? 
{ runtime: { timestamp: runtime.timestamp, - ...summarizeTaskRunStat(runtime.value, config), + ...summarizeTaskRunStat(logger, runtime.value, config), }, } : {}), diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index 46dc56b2bac4..32fe9bf60466 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -10,6 +10,7 @@ import { Subject, Observable } from 'rxjs'; import stats from 'stats-lite'; import sinon from 'sinon'; import { take, tap, bufferCount, skip, map } from 'rxjs/operators'; +import { mockLogger } from '../test_utils'; import { ConcreteTaskInstance, TaskStatus } from '../task'; import { @@ -36,9 +37,11 @@ import { configSchema } from '../config'; describe('Task Run Statistics', () => { let fakeTimer: sinon.SinonFakeTimers; + const logger = mockLogger(); beforeAll(() => { fakeTimer = sinon.useFakeTimers(); + jest.resetAllMocks(); }); afterAll(() => fakeTimer.restore()); @@ -77,7 +80,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig()).value, })), take(runAtDrift.length), bufferCount(runAtDrift.length) @@ -145,7 +148,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig()).value, })), take(runDurations.length * 2), bufferCount(runDurations.length * 2) @@ -241,7 +244,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig()).value, })), take(10), bufferCount(10) @@ -321,6 +324,7 @@ describe('Task Run Statistics', () => { map(({ key, value }: AggregatedStat) => ({ key, value: summarizeTaskRunStat( + logger, value, getTaskManagerConfig({ monitored_task_execution_thresholds: { @@ -449,7 +453,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig({})).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig({})).value, })), take(taskEvents.length), bufferCount(taskEvents.length) @@ -590,7 +594,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig({})).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig({})).value, })), take(taskEvents.length), bufferCount(taskEvents.length) @@ -707,7 +711,7 @@ describe('Task Run Statistics', () => { // Use 'summarizeTaskRunStat' to receive summarize stats map(({ key, value }: AggregatedStat) => ({ key, - value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + value: summarizeTaskRunStat(logger, value, getTaskManagerConfig()).value, })), tap(() => { expectedTimestamp.push(new Date().toISOString()); diff 
--git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index 3946827827fe..44908706aa6e 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -38,6 +38,7 @@ import { import { HealthStatus } from './monitoring_stats_stream'; import { TaskPollingLifecycle } from '../polling_lifecycle'; import { TaskExecutionFailureThreshold, TaskManagerConfig } from '../config'; +import { Logger } from '../../../../../src/core/server'; interface FillPoolStat extends JsonObject { duration: number[]; @@ -337,6 +338,7 @@ const DEFAULT_POLLING_FREQUENCIES = { }; export function summarizeTaskRunStat( + logger: Logger, { polling: { // eslint-disable-next-line @typescript-eslint/naming-convention @@ -403,6 +405,7 @@ export function summarizeTaskRunStat( executionResultFrequency, (typedResultFrequencies, taskType) => summarizeTaskExecutionResultFrequencyStat( + logger, { ...DEFAULT_TASK_RUN_FREQUENCIES, ...calculateFrequency(typedResultFrequencies), @@ -418,16 +421,35 @@ export function summarizeTaskRunStat( } function summarizeTaskExecutionResultFrequencyStat( + logger: Logger, resultFrequencySummary: ResultFrequency, executionErrorThreshold: TaskExecutionFailureThreshold ): ResultFrequencySummary { + const status = getHealthStatus(logger, resultFrequencySummary, executionErrorThreshold); return { ...resultFrequencySummary, - status: - resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold - ? resultFrequencySummary.Failed > executionErrorThreshold.error_threshold - ? HealthStatus.Error - : HealthStatus.Warning - : HealthStatus.OK, + status, }; } + +function getHealthStatus( + logger: Logger, + resultFrequencySummary: ResultFrequency, + executionErrorThreshold: TaskExecutionFailureThreshold +): HealthStatus { + if (resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold) { + if (resultFrequencySummary.Failed > executionErrorThreshold.error_threshold) { + logger.debug( + `setting HealthStatus.Error because resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) > error_threshold (${executionErrorThreshold.error_threshold})` + ); + return HealthStatus.Error; + } else { + logger.debug( + `setting HealthStatus.Warning because resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) > warn_threshold (${executionErrorThreshold.warn_threshold})` + ); + return HealthStatus.Warning; + } + } + + return HealthStatus.OK; +} diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 01c1d6a5f398..c3f6461a35cb 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -20,7 +20,7 @@ import { RawMonitoringStats, summarizeMonitoringStats, } from '../monitoring'; -import { ServiceStatusLevels } from 'src/core/server'; +import { ServiceStatusLevels, Logger } from 'src/core/server'; import { configSchema, TaskManagerConfig } from '../config'; import { calculateHealthStatusMock } from '../lib/calculate_health_status.mock'; import { FillPoolResult } from '../lib/fill_pool'; @@ -30,14 +30,14 @@ jest.mock('../lib/log_health_metrics', () => ({ })); describe('healthRoute', () => { + const logger = loggingSystemMock.create().get(); + beforeEach(() => { jest.resetAllMocks(); }); it('registers the route', async () => { const router = 
httpServiceMock.createRouter(); - - const logger = loggingSystemMock.create().get(); healthRoute(router, of(), logger, uuid.v4(), getTaskManagerConfig()); const [config] = router.get.mock.calls[0]; @@ -47,7 +47,6 @@ describe('healthRoute', () => { it('logs the Task Manager stats at a fixed interval', async () => { const router = httpServiceMock.createRouter(); - const logger = loggingSystemMock.create().get(); const calculateHealthStatus = calculateHealthStatusMock.create(); calculateHealthStatus.mockImplementation(() => HealthStatus.OK); const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics'); @@ -87,19 +86,22 @@ describe('healthRoute', () => { id, timestamp: expect.any(String), status: expect.any(String), - ...ignoreCapacityEstimation(summarizeMonitoringStats(mockStat, getTaskManagerConfig({}))), + ...ignoreCapacityEstimation( + summarizeMonitoringStats(logger, mockStat, getTaskManagerConfig({})) + ), }); expect(logHealthMetrics.mock.calls[1][0]).toMatchObject({ id, timestamp: expect.any(String), status: expect.any(String), - ...ignoreCapacityEstimation(summarizeMonitoringStats(nextMockStat, getTaskManagerConfig({}))), + ...ignoreCapacityEstimation( + summarizeMonitoringStats(logger, nextMockStat, getTaskManagerConfig({})) + ), }); }); it(`logs at a warn level if the status is warning`, async () => { const router = httpServiceMock.createRouter(); - const logger = loggingSystemMock.create().get(); const calculateHealthStatus = calculateHealthStatusMock.create(); calculateHealthStatus.mockImplementation(() => HealthStatus.Warning); const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics'); @@ -141,7 +143,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(warnRuntimeStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, warnRuntimeStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[1][0]).toMatchObject({ @@ -149,7 +151,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(warnConfigurationStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, warnConfigurationStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[2][0]).toMatchObject({ @@ -157,7 +159,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(warnWorkloadStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, warnWorkloadStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[3][0]).toMatchObject({ @@ -165,14 +167,13 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(warnEphemeralStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, warnEphemeralStat, getTaskManagerConfig({})) ), }); }); it(`logs at an error level if the status is error`, async () => { const router = httpServiceMock.createRouter(); - const logger = loggingSystemMock.create().get(); const calculateHealthStatus = calculateHealthStatusMock.create(); calculateHealthStatus.mockImplementation(() => HealthStatus.Error); const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics'); @@ -214,7 +215,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - 
summarizeMonitoringStats(errorRuntimeStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, errorRuntimeStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[1][0]).toMatchObject({ @@ -222,7 +223,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(errorConfigurationStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, errorConfigurationStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[2][0]).toMatchObject({ @@ -230,7 +231,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(errorWorkloadStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, errorWorkloadStat, getTaskManagerConfig({})) ), }); expect(logHealthMetrics.mock.calls[3][0]).toMatchObject({ @@ -238,7 +239,7 @@ describe('healthRoute', () => { timestamp: expect.any(String), status: expect.any(String), ...ignoreCapacityEstimation( - summarizeMonitoringStats(errorEphemeralStat, getTaskManagerConfig({})) + summarizeMonitoringStats(logger, errorEphemeralStat, getTaskManagerConfig({})) ), }); }); @@ -251,7 +252,7 @@ describe('healthRoute', () => { const { serviceStatus$ } = healthRoute( router, stats$, - loggingSystemMock.create().get(), + logger, uuid.v4(), getTaskManagerConfig({ monitored_stats_required_freshness: 1000, @@ -278,6 +279,7 @@ describe('healthRoute', () => { status: 'error', ...ignoreCapacityEstimation( summarizeMonitoringStats( + logger, mockHealthStats({ last_update: expect.any(String), stats: { @@ -307,9 +309,15 @@ describe('healthRoute', () => { }); expect(await serviceStatus).toMatchObject({ - level: ServiceStatusLevels.unavailable, - summary: 'Task Manager is unavailable', + level: ServiceStatusLevels.degraded, + summary: 'Task Manager is unhealthy', }); + const debugCalls = (logger as jest.Mocked).debug.mock.calls as string[][]; + const warnMessage = /^setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana/; + const found = debugCalls + .map((arr) => arr[0]) + .find((message) => message.match(warnMessage) != null); + expect(found).toMatch(warnMessage); }); it('returns a error status if the workload stats have not been updated within the required cold freshness', async () => { @@ -320,7 +328,7 @@ describe('healthRoute', () => { healthRoute( router, stats$, - loggingSystemMock.create().get(), + logger, uuid.v4(), getTaskManagerConfig({ monitored_stats_required_freshness: 5000, @@ -352,6 +360,7 @@ describe('healthRoute', () => { status: 'error', ...ignoreCapacityEstimation( summarizeMonitoringStats( + logger, mockHealthStats({ last_update: expect.any(String), stats: { @@ -388,7 +397,7 @@ describe('healthRoute', () => { healthRoute( router, stats$, - loggingSystemMock.create().get(), + logger, uuid.v4(), getTaskManagerConfig({ monitored_stats_required_freshness: 1000, @@ -423,6 +432,7 @@ describe('healthRoute', () => { status: 'error', ...ignoreCapacityEstimation( summarizeMonitoringStats( + logger, mockHealthStats({ last_update: expect.any(String), stats: { diff --git a/x-pack/plugins/task_manager/server/routes/health.ts b/x-pack/plugins/task_manager/server/routes/health.ts index 30986ab56f96..410166218443 100644 --- a/x-pack/plugins/task_manager/server/routes/health.ts +++ b/x-pack/plugins/task_manager/server/routes/health.ts @@ -15,12 +15,7 @@ import { import { Observable, Subject } from 'rxjs'; import 
{ tap, map } from 'rxjs/operators'; import { throttleTime } from 'rxjs/operators'; -import { - Logger, - ServiceStatus, - ServiceStatusLevels, - ServiceStatusLevel, -} from '../../../../../src/core/server'; +import { Logger, ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server'; import { MonitoringStats, summarizeMonitoringStats, @@ -67,8 +62,8 @@ export function healthRoute( const requiredHotStatsFreshness: number = config.monitored_stats_required_freshness; function getHealthStatus(monitoredStats: MonitoringStats) { - const summarizedStats = summarizeMonitoringStats(monitoredStats, config); - const status = calculateHealthStatus(summarizedStats, config); + const summarizedStats = summarizeMonitoringStats(logger, monitoredStats, config); + const status = calculateHealthStatus(summarizedStats, config, logger); const now = Date.now(); const timestamp = new Date(now).toISOString(); return { id: taskManagerId, timestamp, status, ...summarizedStats }; From dcc5962e42769313b866eb2a172176099bfd0f44 Mon Sep 17 00:00:00 2001 From: Patrick Mueller Date: Tue, 24 Aug 2021 01:35:54 -0400 Subject: [PATCH 3/6] change hot refresh config to allow more slop when checking for problems --- .../task_manager/server/lib/calculate_health_status.ts | 9 +++++---- x-pack/plugins/task_manager/server/routes/health.test.ts | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts b/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts index 306ab8a50e31..b544674d61c0 100644 --- a/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts +++ b/x-pack/plugins/task_manager/server/lib/calculate_health_status.ts @@ -18,11 +18,12 @@ export function calculateHealthStatus( ): HealthStatus { const now = Date.now(); - // if "hot" health stats are any more stale than monitored_stats_required_freshness (pollInterval +1s buffer by default) - // consider the system unhealthy - const requiredHotStatsFreshness: number = config.monitored_stats_required_freshness; + // if "hot" health stats are any more stale than monitored_stats_required_freshness + // times a multiplier, consider the system unhealthy + const requiredHotStatsFreshness: number = config.monitored_stats_required_freshness * 3; - // if "cold" health stats are any more stale than the configured refresh (+ a buffer), consider the system unhealthy + // if "cold" health stats are any more stale than the configured refresh + // times a multiplier, consider the system unhealthy const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5; if (hasStatus(summarizedStats.stats, HealthStatus.Error)) { diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index c3f6461a35cb..f34728cd8ff3 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -270,7 +270,7 @@ describe('healthRoute', () => { stats$.next( mockHealthStats({ - last_update: new Date(Date.now() - 1500).toISOString(), + last_update: new Date(Date.now() - 3001).toISOString(), }) ); @@ -408,7 +408,7 @@ describe('healthRoute', () => { await sleep(0); // eslint-disable-next-line @typescript-eslint/naming-convention - const last_successful_poll = new Date(Date.now() - 2000).toISOString(); + const last_successful_poll = new Date(Date.now() - 3001).toISOString(); stats$.next( mockHealthStats({ stats: { From 
7ebbfdd3811da66550a17df176e091e8e19dfc87 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Tue, 24 Aug 2021 12:43:31 -0400
Subject: [PATCH 4/6] update Detected potential performance message

---
 .../task_manager/server/lib/log_health_metrics.test.ts    | 6 +++---
 .../plugins/task_manager/server/lib/log_health_metrics.ts | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
index 28f9e02c4d4d..4e17d64870a3 100644
--- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
+++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
@@ -51,14 +51,14 @@ describe('logHealthMetrics', () => {
 
     logHealthMetrics(health, logger, config);
     const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls;
-    const performanceMessage = `Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`;
+    const performanceMessage = /^Task Manager detected a degradation in performance/;
     const lastStatsMessage = /^Latest Monitored Stats: \{.*\}$/;
     expect(debugCalls[0][0] as string).toMatch(lastStatsMessage);
     expect(debugCalls[1][0] as string).toMatch(lastStatsMessage);
-    expect(debugCalls[2][0] as string).toBe(performanceMessage);
+    expect(debugCalls[2][0] as string).toMatch(performanceMessage);
     expect(debugCalls[3][0] as string).toMatch(lastStatsMessage);
     expect(debugCalls[4][0] as string).toMatch(lastStatsMessage);
-    expect(debugCalls[5][0] as string).toBe(performanceMessage);
+    expect(debugCalls[5][0] as string).toMatch(performanceMessage);
   });
 
   it('should not log a warning message to enable verbose logging when the status goes from Warning to OK', () => {
diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
index 1114b2861d3f..8f9c676f5ea3 100644
--- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
+++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
@@ -44,6 +44,7 @@ export function logHealthMetrics(
   }
 
   const message = `Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`;
+  const detectedProblemMessage = `Task Manager detected a degradation in performance. This is usually temporary, and Kibana can recover automatically. If the problem persists, check the docs for troubleshooting information: https://www.elastic.co/guide/en/kibana/current/task-manager-health-monitoring.html .`;
   if (enabled) {
     const driftInSeconds = (monitoredHealth.stats.runtime?.value.drift.p99 ?? 0) / 1000;
     if (
@@ -80,9 +81,7 @@ export function logHealthMetrics(
     // This is legacy support - we used to always show this
     logger.debug(message);
     if (logLevel !== LogLevel.Debug && lastLogLevel === LogLevel.Debug) {
-      logger.debug(
-        `Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
-      );
+      logger.debug(detectedProblemMessage);
     }
   }
 

From e8c9b0849dbb3293b9912febed9a73786feff2f6 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Wed, 25 Aug 2021 08:46:38 -0400
Subject: [PATCH 5/6] use branch version in docLink in logged message

---
 x-pack/plugins/task_manager/server/lib/log_health_metrics.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
index 8f9c676f5ea3..d541ffb5684d 100644
--- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
+++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
@@ -5,6 +5,8 @@
  * 2.0.
  */
 
+import { kibanaPackageJson } from '@kbn/utils';
+
 import { isEmpty } from 'lodash';
 import { Logger } from '../../../../../src/core/server';
 import { HealthStatus } from '../monitoring';
@@ -44,7 +46,8 @@ export function logHealthMetrics(
   }
 
   const message = `Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`;
-  const detectedProblemMessage = `Task Manager detected a degradation in performance. This is usually temporary, and Kibana can recover automatically. If the problem persists, check the docs for troubleshooting information: https://www.elastic.co/guide/en/kibana/current/task-manager-health-monitoring.html .`;
+  const docLink = `https://www.elastic.co/guide/en/kibana/${kibanaPackageJson.branch}/task-manager-health-monitoring.html`;
+  const detectedProblemMessage = `Task Manager detected a degradation in performance. This is usually temporary, and Kibana can recover automatically. If the problem persists, check the docs for troubleshooting information: ${docLink} .`;
   if (enabled) {
     const driftInSeconds = (monitoredHealth.stats.runtime?.value.drift.p99 ?? 0) / 1000;
     if (

From 73e12c716ee0128cbe14c5d7f0b5493bb6d21091 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Sun, 29 Aug 2021 22:02:12 -0400
Subject: [PATCH 6/6] change alerting plugin usage of unavailable status to degraded status

---
 .../plugins/alerting/server/health/get_state.test.ts |  8 ++++----
 x-pack/plugins/alerting/server/health/get_state.ts   | 10 ++++------
 x-pack/plugins/alerting/server/plugin.ts             |  2 +-
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/x-pack/plugins/alerting/server/health/get_state.test.ts b/x-pack/plugins/alerting/server/health/get_state.test.ts
index b58a12794188..9429dcc07d92 100644
--- a/x-pack/plugins/alerting/server/health/get_state.test.ts
+++ b/x-pack/plugins/alerting/server/health/get_state.test.ts
@@ -216,8 +216,8 @@ describe('getHealthServiceStatusWithRetryAndErrorHandling', () => {
       })
     ).toPromise();
 
-    expect(status.level).toEqual(ServiceStatusLevels.unavailable);
-    expect(status.summary).toEqual('Alerting framework is unavailable');
+    expect(status.level).toEqual(ServiceStatusLevels.degraded);
+    expect(status.summary).toEqual('Alerting framework is degraded');
     expect(status.meta).toBeUndefined();
   });
 
@@ -275,8 +275,8 @@ describe('getHealthServiceStatusWithRetryAndErrorHandling', () => {
       }),
       retryDelay
     ).subscribe((status) => {
-      expect(status.level).toEqual(ServiceStatusLevels.unavailable);
-      expect(status.summary).toEqual('Alerting framework is unavailable');
+      expect(status.level).toEqual(ServiceStatusLevels.degraded);
+      expect(status.summary).toEqual('Alerting framework is degraded');
       expect(status.meta).toEqual({ error: err });
     });
 
diff --git a/x-pack/plugins/alerting/server/health/get_state.ts b/x-pack/plugins/alerting/server/health/get_state.ts
index 255037d7015a..34f897ad5b73 100644
--- a/x-pack/plugins/alerting/server/health/get_state.ts
+++ b/x-pack/plugins/alerting/server/health/get_state.ts
@@ -73,9 +73,7 @@ const getHealthServiceStatus = async (
   const level =
     doc.state?.health_status === HealthStatus.OK
       ? ServiceStatusLevels.available
-      : doc.state?.health_status === HealthStatus.Warning
-      ? ServiceStatusLevels.degraded
-      : ServiceStatusLevels.unavailable;
+      : ServiceStatusLevels.degraded;
   return {
     level,
     summary: LEVEL_SUMMARY[level.toString()],
@@ -102,10 +100,10 @@ export const getHealthServiceStatusWithRetryAndErrorHandling = (
       );
     }),
     catchError((error) => {
-      logger.warn(`Alerting framework is unavailable due to the error: ${error}`);
+      logger.warn(`Alerting framework is degraded due to the error: ${error}`);
       return of({
-        level: ServiceStatusLevels.unavailable,
-        summary: LEVEL_SUMMARY[ServiceStatusLevels.unavailable.toString()],
+        level: ServiceStatusLevels.degraded,
+        summary: LEVEL_SUMMARY[ServiceStatusLevels.degraded.toString()],
         meta: { error },
       });
     })
diff --git a/x-pack/plugins/alerting/server/plugin.ts b/x-pack/plugins/alerting/server/plugin.ts
index 17361a361955..bb42beba6e23 100644
--- a/x-pack/plugins/alerting/server/plugin.ts
+++ b/x-pack/plugins/alerting/server/plugin.ts
@@ -240,7 +240,7 @@ export class AlertingPlugin {
     );
 
     const serviceStatus$ = new BehaviorSubject<ServiceStatus>({
-      level: ServiceStatusLevels.unavailable,
+      level: ServiceStatusLevels.degraded,
       summary: 'Alerting is initializing',
     });
     core.status.set(serviceStatus$);