From 669e1d0660e5269784aebc0ca55ea1ab383e62bd Mon Sep 17 00:00:00 2001
From: Ryland Herrick <ryalnd@gmail.com>
Date: Fri, 12 Jul 2024 14:10:25 -0500
Subject: [PATCH] [Detection Engine] Addresses Flakiness in ML FTR tests
 (#188155)

## Summary

The full chronicle of this endeavor can be found
[here](https://github.com/elastic/kibana/pull/182183), but [this
comment](https://github.com/elastic/kibana/pull/182183#issuecomment-2221517519)
summarizes the identified issue:

> I [finally
found](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6516#01909dde-a3e8-4e47-b255-b1ff7cac8f8d/6-2368)
the cause of these failures in the response to our "setup modules"
request to ML. Attaching here for posterity:
>
> <details>
> <summary>Setup Modules Failure Response</summary>
>
> ```json
> {
>   "jobs": [
>     { "id": "v3_linux_anomalous_network_port_activity", "success": true },
>     {
>       "id": "v3_linux_anomalous_network_activity",
>       "success": false,
>       "error": {
>         "error": {
>           "root_cause": [
>             {
>               "type": "no_shard_available_action_exception",
>               "reason": "[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]"
>             }
>           ],
>           "type": "search_phase_execution_exception",
>           "reason": "all shards failed",
>           "phase": "query",
>           "grouped": true,
>           "failed_shards": [
>             {
>               "shard": 0,
>               "index": ".ml-anomalies-custom-v3_linux_network_configuration_discovery",
>               "node": "dKzpvp06ScO0OxqHilETEA",
>               "reason": {
>                 "type": "no_shard_available_action_exception",
>                 "reason": "[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]"
>               }
>             }
>           ]
>         },
>         "status": 503
>       }
>     }
>   ],
>   "datafeeds": [
>     {
>       "id": "datafeed-v3_linux_anomalous_network_port_activity",
>       "success": true,
>       "started": false,
>       "awaitingMlNodeAllocation": false
>     },
>     {
>       "id": "datafeed-v3_linux_anomalous_network_activity",
>       "success": false,
>       "started": false,
>       "awaitingMlNodeAllocation": false,
>       "error": {
>         "error": {
>           "root_cause": [
>             {
>               "type": "resource_not_found_exception",
>               "reason": "No known job with id 'v3_linux_anomalous_network_activity'"
>             }
>           ],
>           "type": "resource_not_found_exception",
>           "reason": "No known job with id 'v3_linux_anomalous_network_activity'"
>         },
>         "status": 404
>       }
>     }
>   ],
>   "kibana": {}
> }
>
> ```
> </details>

This branch, then, fixes said issue by (relatively simply) retrying the
failed API call until it succeeds.

### Related Issues
Addresses:
- https://github.com/elastic/kibana/issues/171426
- https://github.com/elastic/kibana/issues/187478
- https://github.com/elastic/kibana/issues/187614
- https://github.com/elastic/kibana/issues/182009

### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
- [x] [Flaky Test
Runner](https://ci-stats.kibana.dev/trigger_flaky_test_runner/1) was
used on any tests changed
- [x] [ESS Rule Execution FTR x
200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6528)
- [x] [Serverless Rule Execution FTR x
200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6529)

### For maintainers

- [x] This was checked for breaking API changes and was [labeled
appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

(cherry picked from commit 3df635ef4a8c86c41c91ac5f59198a9b67d1dc8b)
---
 .../execution_logic/machine_learning.ts       |  7 ++--
 .../machine_learning_alert_suppression.ts     |  5 ++-
 .../machine_learning_setup.ts                 | 40 ++++++++++++++++++-
 3 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts
index 5d73249e576f4..be6464baff393 100644
--- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts
+++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts
@@ -40,12 +40,12 @@ import {
   importFile,
 } from '../../../../../lists_and_exception_lists/utils';
 import {
-  executeSetupModuleRequest,
   forceStartDatafeeds,
   getAlerts,
   getPreviewAlerts,
   previewRule,
   previewRuleWithExceptionEntries,
+  setupMlModulesWithRetry,
 } from '../../../../utils';
 import {
   createRule,
@@ -86,13 +86,12 @@ export default ({ getService }: FtrProviderContext) => {
     rule_id: 'ml-rule-id',
   };
 
-  // FLAKY: https://github.com/elastic/kibana/issues/171426
-  describe.skip('@ess @serverless @serverlessQA Machine learning type rules', () => {
+  describe('@ess @serverless @serverlessQA Machine learning type rules', () => {
     before(async () => {
       // Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
       // as the job looks for certain indices on start
       await esArchiver.load(auditPath);
-      await executeSetupModuleRequest({ module: siemModule, rspCode: 200, supertest });
+      await setupMlModulesWithRetry({ module: siemModule, supertest, retry });
       await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
       await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
     });
diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts
index 2674c224a3100..84085dff514be 100644
--- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts
+++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning_alert_suppression.ts
@@ -27,7 +27,6 @@ import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder'
 import { FtrProviderContext } from '../../../../../../ftr_provider_context';
 import {
   dataGeneratorFactory,
-  executeSetupModuleRequest,
   forceStartDatafeeds,
   getAlerts,
   getOpenAlerts,
@@ -36,6 +35,7 @@ import {
   previewRule,
   previewRuleWithExceptionEntries,
   setAlertStatus,
+  setupMlModulesWithRetry,
 } from '../../../../utils';
 import {
   createRule,
@@ -51,6 +51,7 @@ export default ({ getService }: FtrProviderContext) => {
   const es = getService('es');
   const log = getService('log');
   const config = getService('config');
+  const retry = getService('retry');
 
   const isServerless = config.get('serverless');
   const dataPathBuilder = new EsArchivePathBuilder(isServerless);
@@ -94,7 +95,7 @@ export default ({ getService }: FtrProviderContext) => {
         // Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
         // as the job looks for certain indices on start
         await esArchiver.load(auditbeatArchivePath);
-        await executeSetupModuleRequest({ module: mlModuleName, rspCode: 200, supertest });
+        await setupMlModulesWithRetry({ module: mlModuleName, retry, supertest });
         await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
         await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
         await deleteAllAnomalies(log, es);
diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts
index fa0c6fa4f78b5..bd8214e63e4d1 100644
--- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts
+++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/utils/machine_learning/machine_learning_setup.ts
@@ -6,9 +6,18 @@
  */
 
 import type SuperTest from 'supertest';
+import { RetryService } from '@kbn/ftr-common-functional-services';
 import { ML_GROUP_ID } from '@kbn/security-solution-plugin/common/constants';
 import { getCommonRequestHeader } from '../../../../../functional/services/ml/common_api';
 
+interface ModuleJob { // one entry of the `jobs` array returned by the ML module setup request
+  id: string; // job id, e.g. "v3_linux_anomalous_network_activity"
+  success: boolean; // false when this job could not be set up
+  error?: {
+    status: number; // status code of the failure (read by setupMlModulesWithRetry)
+  };
+}
+
 export const executeSetupModuleRequest = async ({
   module,
   rspCode,
@@ -17,7 +26,7 @@ export const executeSetupModuleRequest = async ({
   module: string;
   rspCode: number;
   supertest: SuperTest.Agent;
-}) => {
+}): Promise<{ jobs: ModuleJob[] }> => {
   const { body } = await supertest
     .post(`/internal/ml/modules/setup/${module}`)
     .set(getCommonRequestHeader('1'))
@@ -34,6 +43,35 @@ export const executeSetupModuleRequest = async ({
   return body;
 };
 
+export const setupMlModulesWithRetry = async ({ // sets up an ML module, retrying on job failures
+  module,
+  retry,
+  supertest,
+}: {
+  module: string; // module id; executeSetupModuleRequest puts it in the setup URL
+  retry: RetryService; // FTR retry service whose `try` wraps each attempt
+  supertest: SuperTest.Agent; // HTTP agent forwarded to executeSetupModuleRequest
+}) =>
+  retry.try(async () => { // presumably re-run by RetryService whenever this callback throws
+    const response = await executeSetupModuleRequest({
+      module,
+      rspCode: 200, // expected HTTP status (presumably asserted by the helper - confirm)
+      supertest,
+    });
+
+    const allJobsSucceeded = response?.jobs.every((job) => { // only `jobs` is inspected here
+      return job.success || (job.error?.status && job.error.status < 500); // only 5xx fails a job
+    });
+
+    if (!allJobsSucceeded) { // also taken when `response` is nullish
+      throw new Error( // the message embeds the full response for debugging
+        `Expected all jobs to set up successfully, but got ${JSON.stringify(response)}`
+      );
+    }
+
+    return response; // resolved value: the setup response body ({ jobs: ModuleJob[] })
+  });
+
 export const forceStartDatafeeds = async ({
   jobId,
   rspCode,