From 3df635ef4a8c86c41c91ac5f59198a9b67d1dc8b Mon Sep 17 00:00:00 2001 From: Ryland Herrick Date: Fri, 12 Jul 2024 14:10:25 -0500 Subject: [PATCH] [Detection Engine] Addresses Flakiness in ML FTR tests (#188155) ## Summary The full chronicle of this endeavor can be found [here](https://github.com/elastic/kibana/pull/182183), but [this comment](https://github.com/elastic/kibana/pull/182183#issuecomment-2221517519) summarizes the identified issue: > I [finally found](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6516#01909dde-a3e8-4e47-b255-b1ff7cac8f8d/6-2368) the cause of these failures in the response to our "setup modules" request to ML. Attaching here for posterity: > >
> Setup Modules Failure Response > > ```json > { > "jobs": [ > { "id": "v3_linux_anomalous_network_port_activity", "success": true }, > { > "id": "v3_linux_anomalous_network_activity", > "success": false, > "error": { > "error": { > "root_cause": [ > { > "type": "no_shard_available_action_exception", > "reason": "[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]" > } > ], > "type": "search_phase_execution_exception", > "reason": "all shards failed", > "phase": "query", > "grouped": true, > "failed_shards": [ > { > "shard": 0, > "index": ".ml-anomalies-custom-v3_linux_network_configuration_discovery", > "node": "dKzpvp06ScO0OxqHilETEA", > "reason": { > "type": "no_shard_available_action_exception", > "reason": "[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]" > } > } > ] > }, > "status": 503 > } > } > ], > "datafeeds": [ > { > "id": "datafeed-v3_linux_anomalous_network_port_activity", > "success": true, > "started": false, > "awaitingMlNodeAllocation": false > }, > { > "id": "datafeed-v3_linux_anomalous_network_activity", > "success": false, > "started": false, > "awaitingMlNodeAllocation": false, > "error": { > "error": { > "root_cause": [ > { > "type": "resource_not_found_exception", > "reason": "No known job with id 'v3_linux_anomalous_network_activity'" > } > ], > "type": "resource_not_found_exception", > "reason": "No known job with id 'v3_linux_anomalous_network_activity'" > }, > "status": 404 > } > } > ], > "kibana": {} > } > > ``` >
This branch, then, fixes said issue by (relatively simply) retrying the failed API call until it succeeds. ### Related Issues Addresses: - https://github.com/elastic/kibana/issues/171426 - https://github.com/elastic/kibana/issues/187478 - https://github.com/elastic/kibana/issues/187614 - https://github.com/elastic/kibana/issues/182009 ### Checklist - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios - [x] [Flaky Test Runner](https://ci-stats.kibana.dev/trigger_flaky_test_runner/1) was used on any tests changed - [x] [ESS Rule Execution FTR x 200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6528) - [x] [Serverless Rule Execution FTR x 200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6529) ### For maintainers - [x] This was checked for breaking API changes and was [labeled appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process) --- .../execution_logic/machine_learning.ts | 7 ++-- .../machine_learning_alert_suppression.ts | 11 +++-- .../machine_learning_setup.ts | 40 ++++++++++++++++++- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts index 5d73249e576f4..be6464baff393 100644 --- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts +++ 
b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts @@ -40,12 +40,12 @@ import { importFile, } from '../../../../../lists_and_exception_lists/utils'; import { - executeSetupModuleRequest, forceStartDatafeeds, getAlerts, getPreviewAlerts, previewRule, previewRuleWithExceptionEntries, + setupMlModulesWithRetry, } from '../../../../utils'; import { createRule, @@ -86,13 +86,12 @@ export default ({ getService }: FtrProviderContext) => { rule_id: 'ml-rule-id', }; - // FLAKY: https://github.com/elastic/kibana/issues/171426 - describe.skip('@ess @serverless @serverlessQA Machine learning type rules', () => { + describe('@ess @serverless @serverlessQA Machine learning type rules', () => { before(async () => { // Order is critical here: auditbeat data must be loaded before attempting to start the ML job, // as the job looks for certain indices on start await esArchiver.load(auditPath); - await executeSetupModuleRequest({ module: siemModule, rspCode: 200, supertest }); + await setupMlModulesWithRetry({ module: siemModule, supertest, retry }); await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest }); await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies'); }); diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts index dc26aa96348a6..d8869681de692 100644 --- 
a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts +++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts @@ -27,7 +27,6 @@ import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder' import { FtrProviderContext } from '../../../../../../ftr_provider_context'; import { dataGeneratorFactory, - executeSetupModuleRequest, forceStartDatafeeds, getAlerts, getOpenAlerts, @@ -36,6 +35,7 @@ import { previewRule, previewRuleWithExceptionEntries, setAlertStatus, + setupMlModulesWithRetry, } from '../../../../utils'; import { createRule, @@ -51,6 +51,7 @@ export default ({ getService }: FtrProviderContext) => { const es = getService('es'); const log = getService('log'); const config = getService('config'); + const retry = getService('retry'); const isServerless = config.get('serverless'); const dataPathBuilder = new EsArchivePathBuilder(isServerless); @@ -93,7 +94,7 @@ export default ({ getService }: FtrProviderContext) => { // Order is critical here: auditbeat data must be loaded before attempting to start the ML job, // as the job looks for certain indices on start await esArchiver.load(auditbeatArchivePath); - await executeSetupModuleRequest({ module: mlModuleName, rspCode: 200, supertest }); + await setupMlModulesWithRetry({ module: mlModuleName, retry, supertest }); await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest }); await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies'); await deleteAllAnomalies(log, es); @@ -112,8 +113,7 @@ export default ({ getService }: FtrProviderContext) => { await deleteAllAnomalies(log, es); }); - // FLAKY: https://github.com/elastic/kibana/issues/187478 - describe.skip('with 
per-execution suppression duration', () => { + describe('with per-execution suppression duration', () => { beforeEach(() => { ruleProps = { ...baseRuleProps, @@ -245,8 +245,7 @@ export default ({ getService }: FtrProviderContext) => { }); }); - // FLAKY: https://github.com/elastic/kibana/issues/187614 - describe.skip('with interval suppression duration', () => { + describe('with interval suppression duration', () => { beforeEach(() => { ruleProps = { ...baseRuleProps, diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts index fa0c6fa4f78b5..bd8214e63e4d1 100644 --- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts +++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts @@ -6,9 +6,18 @@ */ import type SuperTest from 'supertest'; +import { RetryService } from '@kbn/ftr-common-functional-services'; import { ML_GROUP_ID } from '@kbn/security-solution-plugin/common/constants'; import { getCommonRequestHeader } from '../../../../../functional/services/ml/common_api'; +interface ModuleJob { + id: string; + success: boolean; + error?: { + status: number; + }; +} + export const executeSetupModuleRequest = async ({ module, rspCode, @@ -17,7 +26,7 @@ export const executeSetupModuleRequest = async ({ module: string; rspCode: number; supertest: SuperTest.Agent; -}) => { +}): Promise<{ jobs: ModuleJob[] }> => { const { body } = await supertest .post(`/internal/ml/modules/setup/${module}`) .set(getCommonRequestHeader('1')) @@ -34,6 +43,35 @@ export const executeSetupModuleRequest = async ({ return body; }; +export const setupMlModulesWithRetry = async ({ + module, + retry, + supertest, +}: { + module: string; + 
retry: RetryService; + supertest: SuperTest.Agent; +}) => + retry.try(async () => { + const response = await executeSetupModuleRequest({ + module, + rspCode: 200, + supertest, + }); + + const allJobsSucceeded = response?.jobs.every((job) => { + return job.success || (job.error?.status && job.error.status < 500); + }); + + if (!allJobsSucceeded) { + throw new Error( + `Expected all jobs to set up successfully, but got ${JSON.stringify(response)}` + ); + } + + return response; + }); + export const forceStartDatafeeds = async ({ jobId, rspCode,