diff --git a/x-pack/packages/kbn-elastic-assistant-common/constants.ts b/x-pack/packages/kbn-elastic-assistant-common/constants.ts
index a25c935139b24..d84d9d4cd6825 100755
--- a/x-pack/packages/kbn-elastic-assistant-common/constants.ts
+++ b/x-pack/packages/kbn-elastic-assistant-common/constants.ts
@@ -50,3 +50,6 @@ export const ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL_FIND =
   `${ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL}/_find` as const;
 export const ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL_BULK_ACTION =
   `${ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL}/_bulk_action` as const;
+/** Internal evaluate route, used by both the GET and POST evaluation API calls. */
+export const ELASTIC_AI_ASSISTANT_EVALUATE_URL =
+  `${ELASTIC_AI_ASSISTANT_INTERNAL_URL}/evaluate` as const;
diff --git a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/actions_connector/post_actions_connector_execute_route.schema.yaml b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/actions_connector/post_actions_connector_execute_route.schema.yaml
index e4a1e1ce0f7b9..aa9f2aa8d879d 100644
--- a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/actions_connector/post_actions_connector_execute_route.schema.yaml
+++ b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/actions_connector/post_actions_connector_execute_route.schema.yaml
@@ -1,7 +1,7 @@
 openapi: 3.0.0
 info:
   title: Execute Connector API endpoint
-  version: '1'
+  version: "1"
 paths:
   /internal/elastic_assistant/actions/connector/{connectorId}/_execute:
     post:
@@ -103,4 +103,3 @@ paths:
                     type: string
                   message:
                     type: string
-
diff --git a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.gen.ts b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.gen.ts
index c0920c34a6b2a..6040e2bf9704d 100644
--- a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.gen.ts
+++ b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.gen.ts
@@ -18,5 +18,6 @@ import { z } from 'zod';
 
 export type GetEvaluateResponse = z.infer<typeof GetEvaluateResponse>;
 export const GetEvaluateResponse = z.object({
-  agentExecutors: z.array(z.string()),
+  datasets: z.array(z.string()),
+  graphs: z.array(z.string()),
 });
diff --git a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.schema.yaml b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.schema.yaml
index deccfb9f2488c..01a93b6952a8c 100644
--- a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.schema.yaml
+++ b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/get_evaluate_route.schema.yaml
@@ -1,32 +1,37 @@
 openapi: 3.0.0
 info:
   title: Get Evaluate API endpoint
-  version: '1'
+  version: "1"
 paths:
   /internal/elastic_assistant/evaluate:
     get:
       x-codegen-enabled: true
       x-labels: [ess, serverless]
       operationId: GetEvaluate
-      description: Get relevant data for performing an evaluation like available sample data, agents, and evaluators
+      description: Get relevant data for performing an evaluation, such as the available datasets and graphs
       summary: Get relevant data for performing an evaluation
       tags:
         - Evaluation API
       responses:
-        '200':
+        "200":
           description: Successful response
           content:
             application/json:
               schema:
                 type: object
                 properties:
-                  agentExecutors:
+                  datasets:
+                    type: array
+                    items:
+                      type: string
+                  graphs:
                     type: array
                     items:
                       type: string
                 required:
-                  - agentExecutors
-        '400':
+                  - datasets
+                  - graphs
+        "400":
           description: Generic Error
           content:
             application/json:
diff --git a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.gen.ts b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.gen.ts
index f405d211d6d56..eff5240968ce5 100644
--- a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.gen.ts
+++ b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.gen.ts
@@ -16,63 +16,19 @@
 
 import { z } from 'zod';
 
-export type OutputIndex = z.infer<typeof OutputIndex>;
-export const OutputIndex = z.string().regex(/^.kibana-elastic-ai-assistant-/);
-
-export type DatasetItem = z.infer<typeof DatasetItem>;
-export const DatasetItem = z.object({
-  id: z.string().optional(),
-  input: z.string(),
-  prediction: z.string().optional(),
-  reference: z.string(),
-  tags: z.array(z.string()).optional(),
-});
-
-export type Dataset = z.infer<typeof Dataset>;
-export const Dataset = z.array(DatasetItem).default([]);
+import { Replacements } from '../conversations/common_attributes.gen';
 
 export type PostEvaluateBody = z.infer<typeof PostEvaluateBody>;
 export const PostEvaluateBody = z.object({
-  dataset: Dataset.optional(),
-  evalPrompt: z.string().optional(),
-});
-
-export type PostEvaluateRequestQuery = z.infer<typeof PostEvaluateRequestQuery>;
-export const PostEvaluateRequestQuery = z.object({
-  /**
-   * Agents parameter description
-   */
-  agents: z.string(),
-  /**
-   * Dataset Name parameter description
-   */
-  datasetName: z.string().optional(),
-  /**
-   * Evaluation Type parameter description
-   */
-  evaluationType: z.string().optional(),
-  /**
-   * Eval Model parameter description
-   */
-  evalModel: z.string().optional(),
-  /**
-   * Models parameter description
-   */
-  models: z.string(),
-  /**
-   * Output Index parameter description
-   */
-  outputIndex: OutputIndex,
-  /**
-   * Project Name parameter description
-   */
-  projectName: z.string().optional(),
-  /**
-   * Run Name parameter description
-   */
+  graphs: z.array(z.string()),
+  datasetName: z.string(),
+  connectorIds: z.array(z.string()),
   runName: z.string().optional(),
+  alertsIndexPattern: z.string().optional().default('.alerts-security.alerts-default'),
+  langSmithApiKey: z.string().optional(),
+  replacements: Replacements.optional().default({}),
+  size: z.number().optional().default(20),
 });
-export type PostEvaluateRequestQueryInput = z.input<typeof PostEvaluateRequestQuery>;
 
 export type PostEvaluateRequestBody = z.infer<typeof PostEvaluateRequestBody>;
 export const PostEvaluateRequestBody = PostEvaluateBody;
diff --git a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.schema.yaml b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.schema.yaml
index 4b567f9cd118a..d0bec37344165 100644
--- a/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.schema.yaml
+++ b/x-pack/packages/kbn-elastic-assistant-common/impl/schemas/evaluation/post_evaluate_route.schema.yaml
@@ -1,14 +1,14 @@
 openapi: 3.0.0
 info:
   title: Post Evaluate API endpoint
-  version: '1'
+  version: "1"
 paths:
   /internal/elastic_assistant/evaluate:
     post:
       x-codegen-enabled: true
       x-labels: [ess, serverless]
       operationId: PostEvaluate
-      description: Perform an evaluation using sample data against a combination of Agents and Connectors
+      description: Perform an evaluation using sample data against a combination of Graphs and Connectors
       summary: Performs an evaluation of the Elastic Assistant
       tags:
         - Evaluation API
@@ -17,53 +17,9 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/PostEvaluateBody'
-      parameters:
-        - name: agents
-          in: query
-          description: Agents parameter description
-          required: true
-          schema:
-            type: string
-        - name: datasetName
-          in: query
-          description: Dataset Name parameter description
-          schema:
-            type: string
-        - name: evaluationType
-          in: query
-          description: Evaluation Type parameter description
-          schema:
-            type: string
-        - name: evalModel
-          in: query
-          description: Eval Model parameter description
-          schema:
-            type: string
-        - name: models
-          in: query
-          description: Models parameter description
-          required: true
-          schema:
-            type: string
-        - name: outputIndex
-          in: query
-          description: Output Index parameter description
-          required: true
-          schema:
-            $ref: '#/components/schemas/OutputIndex'
-        - name: projectName
-          in: query
-          description: Project Name parameter description
-          schema:
-            type: string
-        - name: runName
-          in: query
-          description: Run Name parameter description
-          schema:
-            type: string
+              $ref: "#/components/schemas/PostEvaluateBody"
       responses:
-        '200':
+        "200":
           description: Successful response
           content:
             application/json:
@@ -77,7 +33,7 @@ paths:
                 required:
                   - evaluationId
                   - success
-        '400':
+        "400":
           description: Generic Error
           content:
             application/json:
@@ -92,36 +48,33 @@ paths:
                     type: string
 components:
   schemas:
-    OutputIndex:
-      type: string
-      pattern: '^.kibana-elastic-ai-assistant-'
-    DatasetItem:
+    PostEvaluateBody:
       type: object
+      required:
+        - graphs
+        - datasetName
+        - connectorIds
       properties:
-        id:
-          type: string
-        input:
-          type: string
-        prediction:
-          type: string
-        reference:
+        graphs:
+          type: array
+          items:
+            type: string
+        datasetName:
           type: string
-        tags:
+        connectorIds:
           type: array
           items:
             type: string
-      required:
-        - input
-        - reference
-    Dataset:
-      type: array
-      items:
-        $ref: '#/components/schemas/DatasetItem'
-      default: []
-    PostEvaluateBody:
-      type: object
-      properties:
-        dataset:
-          $ref: '#/components/schemas/Dataset'
-        evalPrompt:
+        runName:
+          type: string
+        alertsIndexPattern:
+          type: string
+          default: ".alerts-security.alerts-default"
+        langSmithApiKey:
           type: string
+        replacements:
+          $ref: "../conversations/common_attributes.schema.yaml#/components/schemas/Replacements"
+          default: {}
+        size:
+          type: number
+          default: 20
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.test.tsx b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.test.tsx
index d25953370e97a..2320eceb0271e 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.test.tsx
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.test.tsx
@@ -7,7 +7,7 @@
 
 import { postEvaluation } from './evaluate';
 import { HttpSetup } from '@kbn/core-http-browser';
-import { API_VERSIONS } from '@kbn/elastic-assistant-common';
+import { API_VERSIONS, PostEvaluateRequestBodyInput } from '@kbn/elastic-assistant-common';
 
 jest.mock('@kbn/core-http-browser');
 
@@ -16,39 +16,26 @@ const mockHttp = {
 } as unknown as HttpSetup;
 
 describe('postEvaluation', () => {
-  it('calls the knowledge base API when correct resource path', async () => {
+  const evalParams: PostEvaluateRequestBodyInput = {
+    graphs: ['not', 'alphabetical'],
+    datasetName: 'Test Dataset',
+    runName: 'Test Run Name',
+    connectorIds: ['not', 'alphabetical'],
+  };
+
+  it('calls the evaluate API when correct resource path', async () => {
     (mockHttp.post as jest.Mock).mockResolvedValue({ success: true });
+
     const testProps = {
       http: mockHttp,
-      evalParams: {
-        agents: ['not', 'alphabetical'],
-        dataset: '{}',
-        datasetName: 'Test Dataset',
-        projectName: 'Test Project Name',
-        runName: 'Test Run Name',
-        evalModel: ['not', 'alphabetical'],
-        evalPrompt: 'evalPrompt',
-        evaluationType: ['not', 'alphabetical'],
-        models: ['not', 'alphabetical'],
-        outputIndex: 'outputIndex',
-      },
+      evalParams,
     };
 
     await postEvaluation(testProps);
 
     expect(mockHttp.post).toHaveBeenCalledWith('/internal/elastic_assistant/evaluate', {
-      body: '{"dataset":{},"evalPrompt":"evalPrompt"}',
+      body: '{"graphs":["not","alphabetical"],"datasetName":"Test Dataset","runName":"Test Run Name","connectorIds":["not","alphabetical"]}',
       headers: { 'Content-Type': 'application/json' },
-      query: {
-        models: 'alphabetical,not',
-        agents: 'alphabetical,not',
-        datasetName: 'Test Dataset',
-        evaluationType: 'alphabetical,not',
-        evalModel: 'alphabetical,not',
-        outputIndex: 'outputIndex',
-        projectName: 'Test Project Name',
-        runName: 'Test Run Name',
-      },
       signal: undefined,
       version: API_VERSIONS.internal.v1,
     });
@@ -59,11 +46,12 @@ describe('postEvaluation', () => {
       throw new Error(error);
     });
 
-    const knowledgeBaseArgs = {
+    const evaluationArgs = {
       resource: 'a-resource',
       http: mockHttp,
+      evalParams,
     };
 
-    await expect(postEvaluation(knowledgeBaseArgs)).resolves.toThrowError('simulated error');
+    await expect(postEvaluation(evaluationArgs)).rejects.toThrowError('simulated error');
   });
 });
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.tsx b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.tsx
index 6581e22e77921..864d6af374ce3 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.tsx
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/evaluate.tsx
@@ -8,14 +8,15 @@
 import { HttpSetup, IHttpFetchError } from '@kbn/core-http-browser';
 import {
   API_VERSIONS,
+  ELASTIC_AI_ASSISTANT_EVALUATE_URL,
   GetEvaluateResponse,
+  PostEvaluateRequestBodyInput,
   PostEvaluateResponse,
 } from '@kbn/elastic-assistant-common';
-import { PerformEvaluationParams } from './use_perform_evaluation';
 
 export interface PostEvaluationParams {
   http: HttpSetup;
-  evalParams?: PerformEvaluationParams;
+  evalParams: PostEvaluateRequestBodyInput;
   signal?: AbortSignal | undefined;
 }
 
@@ -33,35 +34,15 @@ export const postEvaluation = async ({
   http,
   evalParams,
   signal,
-}: PostEvaluationParams): Promise<PostEvaluateResponse | IHttpFetchError> => {
-  try {
-    const path = `/internal/elastic_assistant/evaluate`;
-    const query = {
-      agents: evalParams?.agents.sort()?.join(','),
-      datasetName: evalParams?.datasetName,
-      evaluationType: evalParams?.evaluationType.sort()?.join(','),
-      evalModel: evalParams?.evalModel.sort()?.join(','),
-      outputIndex: evalParams?.outputIndex,
-      models: evalParams?.models.sort()?.join(','),
-      projectName: evalParams?.projectName,
-      runName: evalParams?.runName,
-    };
-
-    return await http.post<PostEvaluateResponse>(path, {
-      body: JSON.stringify({
-        dataset: JSON.parse(evalParams?.dataset ?? '[]'),
-        evalPrompt: evalParams?.evalPrompt ?? '',
-      }),
-      headers: {
-        'Content-Type': 'application/json',
-      },
-      query,
-      signal,
-      version: API_VERSIONS.internal.v1,
-    });
-  } catch (error) {
-    return error as IHttpFetchError;
-  }
+}: PostEvaluationParams): Promise<PostEvaluateResponse> => {
+  return http.post<PostEvaluateResponse>(ELASTIC_AI_ASSISTANT_EVALUATE_URL, {
+    body: JSON.stringify(evalParams),
+    headers: {
+      'Content-Type': 'application/json',
+    },
+    signal,
+    version: API_VERSIONS.internal.v1,
+  });
 };
 
 export interface GetEvaluationParams {
@@ -83,9 +64,7 @@ export const getEvaluation = async ({
   signal,
 }: GetEvaluationParams): Promise<GetEvaluateResponse | IHttpFetchError> => {
   try {
-    const path = `/internal/elastic_assistant/evaluate`;
-
-    return await http.get<GetEvaluateResponse>(path, {
+    return await http.get<GetEvaluateResponse>(ELASTIC_AI_ASSISTANT_EVALUATE_URL, {
       signal,
       version: API_VERSIONS.internal.v1,
     });
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.test.tsx b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.test.tsx
index f9fdb2e80b7b2..6d1296fc9aa64 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.test.tsx
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.test.tsx
@@ -9,7 +9,7 @@ import { act, renderHook } from '@testing-library/react-hooks';
 import { usePerformEvaluation, UsePerformEvaluationParams } from './use_perform_evaluation';
 import { postEvaluation as _postEvaluation } from './evaluate';
 import { useMutation as _useMutation } from '@tanstack/react-query';
-import { API_VERSIONS } from '@kbn/elastic-assistant-common';
+import { API_VERSIONS, PostEvaluateRequestBodyInput } from '@kbn/elastic-assistant-common';
 
 const useMutationMock = _useMutation as jest.Mock;
 const postEvaluationMock = _postEvaluation as jest.Mock;
@@ -55,20 +55,10 @@ describe('usePerformEvaluation', () => {
       await waitForNextUpdate();
 
       expect(defaultProps.http.post).toHaveBeenCalledWith('/internal/elastic_assistant/evaluate', {
-        body: '{"dataset":[],"evalPrompt":""}',
+        body: undefined,
         headers: {
           'Content-Type': 'application/json',
         },
-        query: {
-          agents: undefined,
-          datasetName: undefined,
-          evalModel: undefined,
-          evaluationType: undefined,
-          models: undefined,
-          outputIndex: undefined,
-          projectName: undefined,
-          runName: undefined,
-        },
         signal: undefined,
         version: API_VERSIONS.internal.v1,
       });
@@ -78,17 +68,13 @@ describe('usePerformEvaluation', () => {
   it('Correctly passes and formats evalParams', async () => {
     useMutationMock.mockImplementation(async (queryKey, fn, opts) => {
       try {
-        const res = await fn({
-          agents: ['d', 'c'],
-          dataset: '["kewl"]',
-          evalModel: ['b', 'a'],
-          evalPrompt: 'evalPrompt',
-          evaluationType: ['f', 'e'],
-          models: ['h', 'g'],
-          outputIndex: 'outputIndex',
-          projectName: 'test project',
+        const evalParams: PostEvaluateRequestBodyInput = {
+          graphs: ['d', 'c'],
+          datasetName: 'kewl',
+          connectorIds: ['h', 'g'],
           runName: 'test run',
-        });
+        };
+        const res = await fn(evalParams);
         return Promise.resolve(res);
       } catch (e) {
         opts.onError(e);
@@ -99,20 +85,10 @@ describe('usePerformEvaluation', () => {
       await waitForNextUpdate();
 
       expect(defaultProps.http.post).toHaveBeenCalledWith('/internal/elastic_assistant/evaluate', {
-        body: '{"dataset":["kewl"],"evalPrompt":"evalPrompt"}',
+        body: '{"graphs":["d","c"],"datasetName":"kewl","connectorIds":["h","g"],"runName":"test run"}',
         headers: {
           'Content-Type': 'application/json',
         },
-        query: {
-          agents: 'c,d',
-          datasetName: undefined,
-          evalModel: 'a,b',
-          evaluationType: 'e,f',
-          models: 'g,h',
-          outputIndex: 'outputIndex',
-          projectName: 'test project',
-          runName: 'test run',
-        },
         signal: undefined,
         version: API_VERSIONS.internal.v1,
       });
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.tsx b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.tsx
index 30e95d9d80407..fc803bb0ec50d 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.tsx
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/api/evaluate/use_perform_evaluation.tsx
@@ -6,9 +6,10 @@
  */
 
 import { useMutation } from '@tanstack/react-query';
-import type { HttpSetup, IHttpFetchError, ResponseErrorBody } from '@kbn/core-http-browser';
+import type { HttpSetup, IHttpFetchError } from '@kbn/core-http-browser';
 import type { IToasts } from '@kbn/core-notifications-browser';
 import { i18n } from '@kbn/i18n';
+import { PostEvaluateRequestBodyInput } from '@kbn/elastic-assistant-common';
 import { postEvaluation } from './evaluate';
 
 const PERFORM_EVALUATION_MUTATION_KEY = ['elastic-assistant', 'perform-evaluation'];
@@ -18,17 +19,12 @@ export interface UsePerformEvaluationParams {
   toasts?: IToasts;
 }
 
-export interface PerformEvaluationParams {
-  agents: string[];
-  dataset: string | undefined;
-  datasetName: string | undefined;
-  evalModel: string[];
-  evalPrompt: string;
-  evaluationType: string[];
-  models: string[];
-  outputIndex: string;
-  projectName: string | undefined;
-  runName: string | undefined;
+export interface ResponseError {
+  statusCode: number;
+  success: boolean;
+  message: {
+    error: string;
+  };
 }
 
 /**
@@ -43,15 +39,14 @@ export interface PerformEvaluationParams {
 export const usePerformEvaluation = ({ http, toasts }: UsePerformEvaluationParams) => {
   return useMutation(
     PERFORM_EVALUATION_MUTATION_KEY,
-    (evalParams?: PerformEvaluationParams | void) => {
-      // Optional params workaround: see: https://github.com/TanStack/query/issues/1077#issuecomment-1431247266
-      return postEvaluation({ http, evalParams: evalParams ?? undefined });
+    (evalParams: PostEvaluateRequestBodyInput) => {
+      return postEvaluation({ http, evalParams });
     },
     {
-      onError: (error: IHttpFetchError<ResponseErrorBody>) => {
+      onError: (error: IHttpFetchError<ResponseError>) => {
         if (error.name !== 'AbortError') {
           toasts?.addError(
-            error.body && error.body.message ? new Error(error.body.message) : error,
+            error?.body?.message?.error ? new Error(error.body.message.error) : error,
             {
               title: i18n.translate('xpack.elasticAssistant.evaluation.evaluationError', {
                 defaultMessage: 'Error performing evaluation...',
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/evaluation_settings.tsx b/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/evaluation_settings.tsx
index e6ce69f65abab..a06c28ed360f5 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/evaluation_settings.tsx
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/evaluation_settings.tsx
@@ -17,7 +17,6 @@ import {
   EuiComboBox,
   EuiButton,
   EuiComboBoxOptionOption,
-  EuiTextArea,
   EuiTextColor,
   EuiFieldText,
   EuiFlexItem,
@@ -28,52 +27,33 @@ import {
 
 import { css } from '@emotion/react';
 import { FormattedMessage } from '@kbn/i18n-react';
-import type { GetEvaluateResponse, PostEvaluateResponse } from '@kbn/elastic-assistant-common';
+import type {
+  GetEvaluateResponse,
+  PostEvaluateRequestBodyInput,
+} from '@kbn/elastic-assistant-common';
 import * as i18n from './translations';
 import { useAssistantContext } from '../../../assistant_context';
 import { useLoadConnectors } from '../../../connectorland/use_load_connectors';
 import { getActionTypeTitle, getGenAiConfig } from '../../../connectorland/helpers';
 import { PRECONFIGURED_CONNECTOR } from '../../../connectorland/translations';
 import { usePerformEvaluation } from '../../api/evaluate/use_perform_evaluation';
-import { getApmLink, getDiscoverLink } from './utils';
 import { useEvaluationData } from '../../api/evaluate/use_evaluation_data';
 
-const DEFAULT_EVAL_TYPES_OPTIONS = [
-  { label: 'correctness' },
-  { label: 'esql-validator', disabled: true },
-  { label: 'custom', disabled: true },
-];
-const DEFAULT_OUTPUT_INDEX = '.kibana-elastic-ai-assistant-evaluation-results';
-
 /**
  * Evaluation Settings -- development-only feature for evaluating models
  */
 export const EvaluationSettings: React.FC = React.memo(() => {
-  const { actionTypeRegistry, basePath, http, setTraceOptions, traceOptions } =
-    useAssistantContext();
+  const { actionTypeRegistry, http, setTraceOptions, toasts, traceOptions } = useAssistantContext();
   const { data: connectors } = useLoadConnectors({ http });
-  const {
-    data: evalResponse,
-    mutate: performEvaluation,
-    isLoading: isPerformingEvaluation,
-  } = usePerformEvaluation({
+  const { mutate: performEvaluation, isLoading: isPerformingEvaluation } = usePerformEvaluation({
     http,
+    toasts,
   });
   const { data: evalData } = useEvaluationData({ http });
-  const defaultAgents = useMemo(
-    () => (evalData as GetEvaluateResponse)?.agentExecutors ?? [],
-    [evalData]
-  );
+  const defaultGraphs = useMemo(() => (evalData as GetEvaluateResponse)?.graphs ?? [], [evalData]);
+  const datasets = useMemo(() => (evalData as GetEvaluateResponse)?.datasets ?? [], [evalData]);
 
   // Run Details
-  // Project Name
-  const [projectName, setProjectName] = useState();
-  const onProjectNameChange = useCallback(
-    (e) => {
-      setProjectName(e.target.value);
-    },
-    [setProjectName]
-  );
   // Run Name
   const [runName, setRunName] = useState();
   const onRunNameChange = useCallback(
@@ -82,14 +62,6 @@ export const EvaluationSettings: React.FC = React.memo(() => {
     },
     [setRunName]
   );
-  // Local Output Index
-  const [outputIndex, setOutputIndex] = useState(DEFAULT_OUTPUT_INDEX);
-  const onOutputIndexChange = useCallback(
-    (e) => {
-      setOutputIndex(e.target.value);
-    },
-    [setOutputIndex]
-  );
   /** Trace Options **/
   const [showTraceOptions, setShowTraceOptions] = useState(false);
   const onApmUrlChange = useCallback(
@@ -111,56 +83,17 @@ export const EvaluationSettings: React.FC = React.memo(() => {
     [setTraceOptions, traceOptions]
   );
   /** Dataset **/
-  const [useLangSmithDataset, setUseLangSmithDataset] = useState(true);
-  const datasetToggleButton = useMemo(() => {
-    return (
-      <EuiText
-        size={'xs'}
-        css={css`
-          margin-top: 16px;
-        `}
-      >
-        {i18n.EVALUATOR_DATASET_LABEL}
-        {' ('}
-        <EuiLink
-          color={useLangSmithDataset ? 'primary' : 'text'}
-          onClick={() => setUseLangSmithDataset(true)}
-        >
-          {i18n.LANGSMITH_DATASET_LABEL}
-        </EuiLink>
-        {' / '}
-        <EuiLink
-          color={useLangSmithDataset ? 'text' : 'primary'}
-          onClick={() => setUseLangSmithDataset(false)}
-        >
-          {i18n.CUSTOM_DATASET_LABEL}
-        </EuiLink>
-        {')'}
-      </EuiText>
-    );
-  }, [useLangSmithDataset]);
-  const [datasetName, setDatasetName] = useState<string>();
-  const onDatasetNameChange = useCallback(
-    (e) => {
-      setDatasetName(e.target.value);
-    },
-    [setDatasetName]
-  );
-  const sampleDataset = [
-    {
-      input:
-        'As an expert user of Elastic Security, please generate an accurate and valid ESQL query to detect the use case below. Your response should be formatted to be able to use immediately in an Elastic Security timeline or detection rule. Take your time with the answer, and really make sure you check your knowledge really well on all the functions I am asking for. check it multiple times if you need to. I cannot afford for queries to be inaccurate. Assume I am using the Elastic Common Schema. Ensure the answers are formatted in a way which is easily copyable.\n\n' +
-        'Write an ESQL query for detecting cryptomining activity on an AWS EC2 instance.',
-      reference:
-        'FROM metrics-apm*\n| WHERE metricset.name == ""transaction"" AND metricset.interval == ""1m""\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket',
-    },
-  ];
-  const [datasetText, setDatasetText] = useState<string>(JSON.stringify(sampleDataset, null, 2));
-  const onDatasetTextChange = useCallback(
-    (e) => {
-      setDatasetText(e.target.value);
+  const [selectedDatasetOptions, setSelectedDatasetOptions] = useState<
+    Array<EuiComboBoxOptionOption<string>>
+  >([]);
+  const datasetOptions = useMemo(() => {
+    return datasets.map((label) => ({ label }));
+  }, [datasets]);
+  const onDatasetOptionsChange = useCallback(
+    (selectedOptions: Array<EuiComboBoxOptionOption<string>>) => {
+      setSelectedDatasetOptions(selectedOptions);
     },
-    [setDatasetText]
+    [setSelectedDatasetOptions]
   );
 
   // Predictions
@@ -191,17 +124,17 @@ export const EvaluationSettings: React.FC = React.memo(() => {
     );
   }, [actionTypeRegistry, connectors, visColorsBehindText]);
 
-  // Agents
-  const [selectedAgentOptions, setSelectedAgentOptions] = useState<
+  // Graphs
+  const [selectedGraphOptions, setSelectedGraphOptions] = useState<
     Array<EuiComboBoxOptionOption<string>>
   >([]);
-  const onAgentOptionsChange = useCallback(
-    (agentOptions: Array<EuiComboBoxOptionOption<string>>) => {
-      setSelectedAgentOptions(agentOptions);
+  const onGraphOptionsChange = useCallback(
+    (graphOptions: Array<EuiComboBoxOptionOption<string>>) => {
+      setSelectedGraphOptions(graphOptions);
     },
-    [setSelectedAgentOptions]
+    [setSelectedGraphOptions]
   );
-  const onAgentOptionsCreate = useCallback(
+  const onGraphOptionsCreate = useCallback(
     (searchValue: string) => {
       const normalizedSearchValue = searchValue.trim();
 
@@ -209,108 +142,35 @@ export const EvaluationSettings: React.FC = React.memo(() => {
         return;
       }
 
-      setSelectedAgentOptions([...selectedAgentOptions, { label: normalizedSearchValue }]);
+      setSelectedGraphOptions([...selectedGraphOptions, { label: normalizedSearchValue }]);
     },
-    [selectedAgentOptions]
-  );
-  const agentOptions = useMemo(() => {
-    return defaultAgents.map((label) => ({ label }));
-  }, [defaultAgents]);
-
-  // Evaluation
-  // Evaluation Type
-  const [selectedEvaluationType, setSelectedEvaluationType] = useState<
-    Array<EuiComboBoxOptionOption<string>>
-  >([]);
-  const onEvaluationTypeChange = useCallback(
-    (evaluationType: Array<EuiComboBoxOptionOption<string>>) => {
-      setSelectedEvaluationType(evaluationType);
-    },
-    [setSelectedEvaluationType]
-  );
-  const onEvaluationTypeOptionsCreate = useCallback(
-    (searchValue: string) => {
-      const normalizedSearchValue = searchValue.trim();
-
-      if (!normalizedSearchValue) {
-        return;
-      }
-
-      setSelectedEvaluationType([{ label: normalizedSearchValue }]);
-    },
-    [setSelectedEvaluationType]
-  );
-  const evaluationTypeOptions = useMemo(() => {
-    return DEFAULT_EVAL_TYPES_OPTIONS;
-  }, []);
-
-  // Eval Model
-  const [selectedEvaluatorModelOptions, setSelectedEvaluatorModelOptions] = useState<
-    Array<EuiComboBoxOptionOption<string>>
-  >([]);
-  const onEvaluatorModelOptionsChange = useCallback(
-    (selectedOptions: Array<EuiComboBoxOptionOption<string>>) => {
-      setSelectedEvaluatorModelOptions(selectedOptions);
-    },
-    [setSelectedEvaluatorModelOptions]
-  );
-
-  // Eval Prompt
-  const sampleEvalPrompt: string = `For the below input: \n\n{{input}} \n\na prediction: \n\n{{prediction}} \n\nwas made. How's it stack up against this reference: \n\n{{reference}} \n\nReturn output in a succinct sentence ranking on a simple grading rubric focused on correctness.`;
-  const [evalPrompt, setEvalPrompt] = useState<string>(sampleEvalPrompt);
-  const onEvalPromptChange = useCallback(
-    (e) => {
-      setEvalPrompt(e.target.value);
-    },
-    [setEvalPrompt]
+    [selectedGraphOptions]
   );
+  const graphOptions = useMemo(() => {
+    return defaultGraphs.map((label) => ({ label }));
+  }, [defaultGraphs]);
 
   // Required fields by eval API
   const isPerformEvaluationDisabled =
-    selectedModelOptions.length === 0 ||
-    selectedAgentOptions.length === 0 ||
-    outputIndex.length === 0;
+    selectedModelOptions.length === 0 || selectedGraphOptions.length === 0 || selectedDatasetOptions.length === 0;
 
   // Perform Evaluation Button
   const handlePerformEvaluation = useCallback(async () => {
-    const evalParams = {
-      models: selectedModelOptions.flatMap((option) => option.key ?? []),
-      agents: selectedAgentOptions.map((option) => option.label),
-      dataset: useLangSmithDataset ? undefined : datasetText,
-      datasetName: useLangSmithDataset ? datasetName : undefined,
-      evalModel: selectedEvaluatorModelOptions.flatMap((option) => option.key ?? []),
-      evalPrompt,
-      evaluationType: selectedEvaluationType.map((option) => option.label),
-      outputIndex,
-      projectName,
+    const evalParams: PostEvaluateRequestBodyInput = {
+      connectorIds: selectedModelOptions.flatMap((option) => option.key ?? []).sort(),
+      graphs: selectedGraphOptions.map((option) => option.label).sort(),
+      datasetName: selectedDatasetOptions[0]?.label,
       runName,
     };
     performEvaluation(evalParams);
   }, [
-    datasetName,
-    datasetText,
-    evalPrompt,
-    outputIndex,
     performEvaluation,
-    projectName,
     runName,
-    selectedAgentOptions,
-    selectedEvaluationType,
-    selectedEvaluatorModelOptions,
+    selectedDatasetOptions,
+    selectedGraphOptions,
     selectedModelOptions,
-    useLangSmithDataset,
   ]);
 
-  const discoverLink = useMemo(
-    () => getDiscoverLink(basePath, (evalResponse as PostEvaluateResponse)?.evaluationId ?? ''),
-    [basePath, evalResponse]
-  );
-
-  const apmLink = useMemo(
-    () => getApmLink(basePath, (evalResponse as PostEvaluateResponse)?.evaluationId ?? ''),
-    [basePath, evalResponse]
-  );
-
   const getSection = (title: string, description: string) => (
     <div>
       <EuiFlexGroup gutterSize="s" alignItems="center" responsive={false}>
@@ -337,10 +197,6 @@ export const EvaluationSettings: React.FC = React.memo(() => {
     () => getSection(i18n.PREDICTION_DETAILS_TITLE, i18n.PREDICTION_DETAILS_DESCRIPTION),
     []
   );
-  const evalDetailsSection = useMemo(
-    () => getSection(i18n.EVALUATION_DETAILS_TITLE, i18n.EVALUATION_DETAILS_DESCRIPTION),
-    []
-  );
 
   const buttonCss = css`
     &:hover {
@@ -362,79 +218,33 @@ export const EvaluationSettings: React.FC = React.memo(() => {
         initialIsOpen={true}
         paddingSize="s"
       >
-        <EuiFlexGroup>
-          <EuiFlexItem>
-            <EuiFormRow
-              display="rowCompressed"
-              label={i18n.PROJECT_LABEL}
-              helpText={i18n.PROJECT_DESCRIPTION}
-            >
-              <EuiFieldText
-                aria-label="project-textfield"
-                compressed
-                onChange={onProjectNameChange}
-                placeholder={i18n.PROJECT_PLACEHOLDER}
-                value={projectName}
-              />
-            </EuiFormRow>
-          </EuiFlexItem>
-          <EuiFlexItem>
-            <EuiFormRow
-              display="rowCompressed"
-              label={i18n.RUN_NAME_LABEL}
-              helpText={i18n.RUN_NAME_DESCRIPTION}
-            >
-              <EuiFieldText
-                aria-label="run-name-textfield"
-                compressed
-                onChange={onRunNameChange}
-                placeholder={i18n.RUN_NAME_PLACEHOLDER}
-                value={runName}
-              />
-            </EuiFormRow>
-          </EuiFlexItem>
-        </EuiFlexGroup>
         <EuiFormRow
           display="rowCompressed"
-          label={datasetToggleButton}
-          fullWidth
-          helpText={
-            useLangSmithDataset
-              ? i18n.LANGSMITH_DATASET_DESCRIPTION
-              : i18n.CUSTOM_DATASET_DESCRIPTION
-          }
+          label={i18n.RUN_NAME_LABEL}
+          helpText={i18n.RUN_NAME_DESCRIPTION}
         >
-          {useLangSmithDataset ? (
-            <EuiFieldText
-              aria-label="dataset-name-textfield"
-              compressed
-              onChange={onDatasetNameChange}
-              placeholder={i18n.LANGSMITH_DATASET_PLACEHOLDER}
-              value={datasetName}
-            />
-          ) : (
-            <EuiTextArea
-              aria-label={'evaluation-dataset-textarea'}
-              compressed
-              css={css`
-                min-height: 300px;
-              `}
-              fullWidth
-              onChange={onDatasetTextChange}
-              value={datasetText}
-            />
-          )}
+          <EuiFieldText
+            aria-label={i18n.RUN_NAME_LABEL}
+            compressed
+            onChange={onRunNameChange}
+            placeholder={i18n.RUN_NAME_PLACEHOLDER}
+            value={runName}
+          />
         </EuiFormRow>
         <EuiFormRow
           display="rowCompressed"
-          label={i18n.EVALUATOR_OUTPUT_INDEX_LABEL}
+          label={i18n.EVALUATOR_DATASET_LABEL}
           fullWidth
-          helpText={i18n.EVALUATOR_OUTPUT_INDEX_DESCRIPTION}
+          helpText={i18n.LANGSMITH_DATASET_DESCRIPTION}
         >
-          <EuiFieldText
-            value={outputIndex}
-            onChange={onOutputIndexChange}
-            aria-label="evaluation-output-index-textfield"
+          <EuiComboBox
+            aria-label={i18n.EVALUATOR_DATASET_LABEL}
+            placeholder={i18n.LANGSMITH_DATASET_PLACEHOLDER}
+            singleSelection={{ asPlainText: true }}
+            options={datasetOptions}
+            selectedOptions={selectedDatasetOptions}
+            onChange={onDatasetOptionsChange}
+            compressed={true}
           />
         </EuiFormRow>
         <EuiText
@@ -461,7 +271,7 @@ export const EvaluationSettings: React.FC = React.memo(() => {
               <EuiFieldText
                 value={traceOptions.apmUrl}
                 onChange={onApmUrlChange}
-                aria-label="apm-url-textfield"
+                aria-label={i18n.APM_URL_LABEL}
               />
             </EuiFormRow>
             <EuiFormRow
@@ -473,7 +283,7 @@ export const EvaluationSettings: React.FC = React.memo(() => {
               <EuiFieldText
                 value={traceOptions.langSmithProject}
                 onChange={onLangSmithProjectChange}
-                aria-label="langsmith-project-textfield"
+                aria-label={i18n.LANGSMITH_PROJECT_LABEL}
               />
             </EuiFormRow>
             <EuiFormRow
@@ -485,7 +295,7 @@ export const EvaluationSettings: React.FC = React.memo(() => {
               <EuiFieldText
                 value={traceOptions.langSmithApiKey}
                 onChange={onLangSmithApiKeyChange}
-                aria-label="langsmith-api-key-textfield"
+                aria-label={i18n.LANGSMITH_API_KEY_LABEL}
               />
             </EuiFormRow>
           </>
@@ -508,7 +318,7 @@ export const EvaluationSettings: React.FC = React.memo(() => {
           helpText={i18n.CONNECTORS_DESCRIPTION}
         >
           <EuiComboBox
-            aria-label={'model-selector'}
+            aria-label={i18n.CONNECTORS_LABEL}
             compressed
             options={modelOptions}
             selectedOptions={selectedModelOptions}
@@ -518,80 +328,20 @@ export const EvaluationSettings: React.FC = React.memo(() => {
 
         <EuiFormRow
           display="rowCompressed"
-          label={i18n.AGENTS_LABEL}
-          helpText={i18n.AGENTS_DESCRIPTION}
+          label={i18n.GRAPHS_LABEL}
+          helpText={i18n.GRAPHS_DESCRIPTION}
         >
           <EuiComboBox
-            aria-label={'agent-selector'}
+            aria-label={i18n.GRAPHS_LABEL}
             compressed
-            onCreateOption={onAgentOptionsCreate}
-            options={agentOptions}
-            selectedOptions={selectedAgentOptions}
-            onChange={onAgentOptionsChange}
+            onCreateOption={onGraphOptionsCreate}
+            options={graphOptions}
+            selectedOptions={selectedGraphOptions}
+            onChange={onGraphOptionsChange}
           />
         </EuiFormRow>
       </EuiAccordion>
       <EuiHorizontalRule margin={'s'} />
-      {/* Evaluation Details*/}
-      <EuiAccordion
-        id={i18n.EVALUATION_DETAILS_TITLE}
-        arrowDisplay={'right'}
-        element="fieldset"
-        buttonProps={{ paddingSize: 's', css: buttonCss }}
-        buttonContent={evalDetailsSection}
-        paddingSize="s"
-      >
-        <EuiFormRow
-          display="rowCompressed"
-          label={i18n.EVALUATOR_MODEL_LABEL}
-          helpText={i18n.EVALUATOR_MODEL_DESCRIPTION}
-        >
-          <EuiComboBox
-            aria-label={'evaluation-type-select'}
-            compressed
-            options={modelOptions}
-            selectedOptions={selectedEvaluatorModelOptions}
-            singleSelection={{ asPlainText: true }}
-            onChange={onEvaluatorModelOptionsChange}
-          />
-        </EuiFormRow>
-
-        <EuiFormRow
-          display="rowCompressed"
-          label={i18n.EVALUATION_TYPE_LABEL}
-          helpText={i18n.EVALUATION_TYPE_DESCRIPTION}
-        >
-          <EuiComboBox
-            aria-label={'evaluation-type-select'}
-            compressed
-            onChange={onEvaluationTypeChange}
-            onCreateOption={onEvaluationTypeOptionsCreate}
-            options={evaluationTypeOptions}
-            selectedOptions={selectedEvaluationType}
-            singleSelection={{ asPlainText: true }}
-          />
-        </EuiFormRow>
-
-        <EuiFormRow
-          display="rowCompressed"
-          label={i18n.EVALUATION_PROMPT_LABEL}
-          fullWidth
-          helpText={i18n.EVALUATION_PROMPT_DESCRIPTION}
-        >
-          <EuiTextArea
-            aria-label={'evaluation-prompt-textarea'}
-            compressed
-            css={css`
-              min-height: 330px;
-            `}
-            disabled={selectedEvaluationType[0]?.label !== 'custom'}
-            fullWidth
-            onChange={onEvalPromptChange}
-            value={evalPrompt}
-          />
-        </EuiFormRow>
-      </EuiAccordion>
-      <EuiHorizontalRule />
       <EuiFlexGroup alignItems="center">
         <EuiFlexItem grow={false}>
           <EuiButton
@@ -608,20 +358,8 @@ export const EvaluationSettings: React.FC = React.memo(() => {
         <EuiFlexItem>
           <EuiText color={'subdued'} size={'xs'}>
             <FormattedMessage
-              defaultMessage="Closing this dialog will cancel the evaluation. You can watch the Kibana server logs for progress, and view results in {discover} {apm}. Can take many minutes for large datasets."
+              defaultMessage="Closing this dialog will cancel the evaluation. You can watch the Kibana server logs for progress. Can take many minutes for large datasets."
               id="xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText"
-              values={{
-                discover: (
-                  <EuiLink external href={discoverLink} target="_blank">
-                    {i18n.EVALUATOR_FUN_FACT_DISCOVER_LINK}
-                  </EuiLink>
-                ),
-                apm: (
-                  <EuiLink external href={apmLink} target="_blank">
-                    {i18n.EVALUATOR_FUN_FACT_APM_LINK}
-                  </EuiLink>
-                ),
-              }}
             />
           </EuiText>
         </EuiFlexItem>
diff --git a/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/translations.ts b/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/translations.ts
index e875b4c35b203..b1adb6296b2a1 100644
--- a/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/translations.ts
+++ b/x-pack/packages/kbn-elastic-assistant/impl/assistant/settings/evaluation_settings/translations.ts
@@ -17,7 +17,7 @@ export const SETTINGS_DESCRIPTION = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.settingsDescription',
   {
     defaultMessage:
-      'Run predictions and evaluations against test data sets using different models (connectors), agents, and evaluation schemes.',
+      'Run predictions against LangSmith test data sets using different models (connectors) and graphs.',
   }
 );
 
@@ -31,7 +31,7 @@ export const RUN_DETAILS_TITLE = i18n.translate(
 export const RUN_DETAILS_DESCRIPTION = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsDescription',
   {
-    defaultMessage: 'Configure test run details like project, run name, dataset, and output index.',
+    defaultMessage: 'Configure test run details like the run name and dataset.',
   }
 );
 
@@ -46,43 +46,7 @@ export const PREDICTION_DETAILS_DESCRIPTION = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsDescription',
   {
     defaultMessage:
-      'Choose models (connectors) and corresponding agents the dataset should run against.',
-  }
-);
-
-export const EVALUATION_DETAILS_TITLE = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsTitle',
-  {
-    defaultMessage: 'Evaluation (Optional)',
-  }
-);
-
-export const EVALUATION_DETAILS_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsDescription',
-  {
-    defaultMessage:
-      'Evaluate prediction results using a specific model (connector) and evaluation criterion.',
-  }
-);
-
-export const PROJECT_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectLabel',
-  {
-    defaultMessage: 'Project',
-  }
-);
-
-export const PROJECT_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectDescription',
-  {
-    defaultMessage: 'LangSmith project to write results to.',
-  }
-);
-
-export const PROJECT_PLACEHOLDER = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.projectPlaceholder',
-  {
-    defaultMessage: '8.12 Testing',
+      'Choose models (connectors) and corresponding graphs the dataset should run against.',
   }
 );
 
@@ -103,7 +67,7 @@ export const RUN_NAME_DESCRIPTION = i18n.translate(
 export const RUN_NAME_PLACEHOLDER = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.runNamePlaceholder',
   {
-    defaultMessage: '8.12 ESQL Query Generation',
+    defaultMessage: '8.16 Streaming Regression',
   }
 );
 
@@ -121,75 +85,17 @@ export const CONNECTORS_DESCRIPTION = i18n.translate(
   }
 );
 
-export const AGENTS_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsLabel',
-  {
-    defaultMessage: 'Agents',
-  }
-);
-
-export const AGENTS_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsDescription',
-  {
-    defaultMessage: 'Select the agents (RAG algorithms) to evaluate the dataset against.',
-  }
-);
-
-export const EVALUATOR_MODEL_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelLabel',
+export const GRAPHS_LABEL = i18n.translate(
+  'xpack.elasticAssistant.assistant.settings.evaluationSettings.graphsLabel',
   {
-    defaultMessage: 'Evaluator Model',
+    defaultMessage: 'Graphs',
   }
 );
 
-export const EVALUATOR_MODEL_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription',
+export const GRAPHS_DESCRIPTION = i18n.translate(
+  'xpack.elasticAssistant.assistant.settings.evaluationSettings.graphsDescription',
   {
-    defaultMessage: 'Model that performs the final evaluation.',
-  }
-);
-
-export const EVALUATION_TYPE_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel',
-  {
-    defaultMessage: 'Evaluation type',
-  }
-);
-
-export const EVALUATION_TYPE_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription',
-  {
-    defaultMessage:
-      'Type of evaluation to perform, e.g. "correctness" "esql-validator", or "custom".',
-  }
-);
-
-export const EVALUATION_PROMPT_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel',
-  {
-    defaultMessage: 'Evaluation prompt',
-  }
-);
-
-export const EVALUATION_PROMPT_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptDescription',
-  {
-    defaultMessage:
-      'Prompt template given `input`, `reference` and `prediction` template variables.',
-  }
-);
-export const EVALUATOR_OUTPUT_INDEX_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexLabel',
-  {
-    defaultMessage: 'Output index',
-  }
-);
-
-export const EVALUATOR_OUTPUT_INDEX_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexDescription',
-  {
-    defaultMessage:
-      'Index to write results to. Must be prefixed with ".kibana-elastic-ai-assistant-".',
+    defaultMessage: 'Select the different graphs to evaluate the dataset against.',
   }
 );
 
@@ -250,14 +156,7 @@ export const LANGSMITH_API_KEY_DESCRIPTION = i18n.translate(
 export const EVALUATOR_DATASET_LABEL = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel',
   {
-    defaultMessage: 'Dataset',
-  }
-);
-
-export const LANGSMITH_DATASET_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetLabel',
-  {
-    defaultMessage: 'LangSmith',
+    defaultMessage: 'LangSmith Dataset',
   }
 );
 
@@ -271,22 +170,7 @@ export const LANGSMITH_DATASET_DESCRIPTION = i18n.translate(
 export const LANGSMITH_DATASET_PLACEHOLDER = i18n.translate(
   'xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetPlaceholder',
   {
-    defaultMessage: 'ESQL Query Generation',
-  }
-);
-
-export const CUSTOM_DATASET_LABEL = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetLabel',
-  {
-    defaultMessage: 'Custom',
-  }
-);
-
-export const CUSTOM_DATASET_DESCRIPTION = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetDescription',
-  {
-    defaultMessage:
-      'Custom dataset to evaluate. Array of objects with "input" and "references" properties.',
+    defaultMessage: 'Select dataset...',
   }
 );
 
@@ -296,16 +180,3 @@ export const PERFORM_EVALUATION = i18n.translate(
     defaultMessage: 'Perform evaluation...',
   }
 );
-
-export const EVALUATOR_FUN_FACT_DISCOVER_LINK = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText',
-  {
-    defaultMessage: 'Discover',
-  }
-);
-export const EVALUATOR_FUN_FACT_APM_LINK = i18n.translate(
-  'xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactApmLinkText',
-  {
-    defaultMessage: 'APM',
-  }
-);
diff --git a/x-pack/plugins/elastic_assistant/common/constants.ts b/x-pack/plugins/elastic_assistant/common/constants.ts
index 45b473e848750..dd6e47e070591 100755
--- a/x-pack/plugins/elastic_assistant/common/constants.ts
+++ b/x-pack/plugins/elastic_assistant/common/constants.ts
@@ -17,9 +17,6 @@ export const ATTACK_DISCOVERY = `${BASE_PATH}/attack_discovery`;
 export const ATTACK_DISCOVERY_BY_CONNECTOR_ID = `${ATTACK_DISCOVERY}/{connectorId}`;
 export const ATTACK_DISCOVERY_CANCEL_BY_CONNECTOR_ID = `${ATTACK_DISCOVERY}/cancel/{connectorId}`;
 
-// Model Evaluation
-export const EVALUATE = `${BASE_PATH}/evaluate`;
-
 export const MAX_CONVERSATIONS_TO_UPDATE_IN_PARALLEL = 50;
 export const CONVERSATIONS_TABLE_MAX_PAGE_SIZE = 100;
 
diff --git a/x-pack/plugins/elastic_assistant/server/__mocks__/request.ts b/x-pack/plugins/elastic_assistant/server/__mocks__/request.ts
index 2407e09df1e55..9dc57bab25ef3 100644
--- a/x-pack/plugins/elastic_assistant/server/__mocks__/request.ts
+++ b/x-pack/plugins/elastic_assistant/server/__mocks__/request.ts
@@ -10,7 +10,6 @@ import {
   ATTACK_DISCOVERY_BY_CONNECTOR_ID,
   ATTACK_DISCOVERY_CANCEL_BY_CONNECTOR_ID,
   CAPABILITIES,
-  EVALUATE,
 } from '../../common/constants';
 import {
   AttackDiscoveryPostRequestBody,
@@ -23,11 +22,11 @@ import {
   ELASTIC_AI_ASSISTANT_CONVERSATIONS_URL_BY_ID,
   ELASTIC_AI_ASSISTANT_CONVERSATIONS_URL_BY_ID_MESSAGES,
   ELASTIC_AI_ASSISTANT_CONVERSATIONS_URL_FIND,
+  ELASTIC_AI_ASSISTANT_EVALUATE_URL,
   ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_URL,
   ELASTIC_AI_ASSISTANT_PROMPTS_URL_BULK_ACTION,
   ELASTIC_AI_ASSISTANT_PROMPTS_URL_FIND,
   PostEvaluateRequestBodyInput,
-  PostEvaluateRequestQueryInput,
 } from '@kbn/elastic-assistant-common';
 import {
   getAppendConversationMessagesSchemaMock,
@@ -74,18 +73,11 @@ export const getGetCapabilitiesRequest = () =>
     path: CAPABILITIES,
   });
 
-export const getPostEvaluateRequest = ({
-  body,
-  query,
-}: {
-  body: PostEvaluateRequestBodyInput;
-  query: PostEvaluateRequestQueryInput;
-}) =>
+export const getPostEvaluateRequest = ({ body }: { body: PostEvaluateRequestBodyInput }) =>
   requestMock.create({
     body,
     method: 'post',
-    path: EVALUATE,
-    query,
+    path: ELASTIC_AI_ASSISTANT_EVALUATE_URL,
   });
 
 export const getCurrentUserFindRequest = () =>
diff --git a/x-pack/plugins/elastic_assistant/server/lib/langchain/execute_custom_llm_chain/index.test.ts b/x-pack/plugins/elastic_assistant/server/lib/langchain/execute_custom_llm_chain/index.test.ts
deleted file mode 100644
index 7772c0d267273..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/langchain/execute_custom_llm_chain/index.test.ts
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { elasticsearchServiceMock } from '@kbn/core-elasticsearch-server-mocks';
-import { coreMock } from '@kbn/core/server/mocks';
-import { KibanaRequest } from '@kbn/core/server';
-import { actionsClientMock } from '@kbn/actions-plugin/server/actions_client/actions_client.mock';
-
-import { loggerMock } from '@kbn/logging-mocks';
-import { initializeAgentExecutorWithOptions, AgentExecutor } from 'langchain/agents';
-
-import { mockActionResponse } from '../../../__mocks__/action_result_data';
-import { langChainMessages } from '../../../__mocks__/lang_chain_messages';
-import { KNOWLEDGE_BASE_INDEX_PATTERN } from '../../../routes/knowledge_base/constants';
-import { callAgentExecutor } from '.';
-import { PassThrough, Stream } from 'stream';
-import {
-  ActionsClientChatOpenAI,
-  ActionsClientBedrockChatModel,
-  ActionsClientSimpleChatModel,
-} from '@kbn/langchain/server';
-import { AgentExecutorParams } from '../executors/types';
-import { ElasticsearchStore } from '../elasticsearch_store/elasticsearch_store';
-
-jest.mock('@kbn/langchain/server', () => {
-  const original = jest.requireActual('@kbn/langchain/server');
-  return {
-    ...original,
-    ActionsClientChatOpenAI: jest.fn(),
-    ActionsClientBedrockChatModel: jest.fn(),
-    ActionsClientSimpleChatModel: jest.fn(),
-  };
-});
-
-const mockConversationChain = {
-  call: jest.fn(),
-};
-
-jest.mock('langchain/chains', () => ({
-  RetrievalQAChain: {
-    fromLLM: jest.fn().mockImplementation(() => mockConversationChain),
-  },
-}));
-
-const mockCall = jest.fn().mockImplementation(() =>
-  Promise.resolve({
-    output: mockActionResponse,
-  })
-);
-const mockInvoke = jest.fn().mockImplementation(() => Promise.resolve());
-
-jest.mock('langchain/agents');
-
-jest.mock('../elasticsearch_store/elasticsearch_store', () => ({
-  ElasticsearchStore: jest.fn().mockImplementation(() => ({
-    asRetriever: jest.fn(),
-    isModelInstalled: jest.fn().mockResolvedValue(true),
-  })),
-}));
-const mockStream = new PassThrough();
-const mockPush = jest.fn();
-jest.mock('@kbn/ml-response-stream/server', () => ({
-  streamFactory: jest.fn().mockImplementation(() => ({
-    DELIMITER: '\n',
-    end: jest.fn(),
-    push: mockPush,
-    responseWithHeaders: {
-      body: mockStream,
-      headers: {
-        'X-Accel-Buffering': 'no',
-        'X-Content-Type-Options': 'nosniff',
-        'Cache-Control': 'no-cache',
-        Connection: 'keep-alive',
-        'Transfer-Encoding': 'chunked',
-      },
-    },
-  })),
-}));
-
-const mockConnectorId = 'mock-connector-id';
-
-// eslint-disable-next-line @typescript-eslint/no-explicit-any
-const mockRequest: KibanaRequest<unknown, unknown, any, any> = { body: {} } as KibanaRequest<
-  unknown,
-  unknown,
-  any, // eslint-disable-line @typescript-eslint/no-explicit-any
-  any // eslint-disable-line @typescript-eslint/no-explicit-any
->;
-
-const actionsClient = actionsClientMock.create();
-const mockLogger = loggerMock.create();
-const mockTelemetry = coreMock.createSetup().analytics;
-const esClientMock = elasticsearchServiceMock.createScopedClusterClient().asCurrentUser;
-const esStoreMock = new ElasticsearchStore(
-  esClientMock,
-  KNOWLEDGE_BASE_INDEX_PATTERN,
-  mockLogger,
-  mockTelemetry
-);
-const defaultProps: AgentExecutorParams<true> = {
-  actionsClient,
-  bedrockChatEnabled: false,
-  connectorId: mockConnectorId,
-  esClient: esClientMock,
-  esStore: esStoreMock,
-  llmType: 'openai',
-  langChainMessages,
-  logger: mockLogger,
-  onNewReplacements: jest.fn(),
-  request: mockRequest,
-  replacements: {},
-};
-const bedrockProps = {
-  ...defaultProps,
-  llmType: 'bedrock',
-};
-const bedrockChatProps = {
-  ...defaultProps,
-  bedrockChatEnabled: true,
-  llmType: 'bedrock',
-};
-const executorMock = initializeAgentExecutorWithOptions as jest.Mock;
-const agentExecutorMock = AgentExecutor as unknown as jest.Mock;
-
-describe('callAgentExecutor', () => {
-  beforeEach(() => {
-    jest.clearAllMocks();
-    executorMock.mockImplementation((_a, _b, { agentType }) => ({
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      call: (props: any, more: any) => mockCall({ ...props, agentType }, more),
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      invoke: (props: any, more: any) => mockInvoke({ ...props, agentType }, more),
-    }));
-    agentExecutorMock.mockReturnValue({
-      call: mockCall,
-      invoke: mockInvoke,
-    });
-  });
-
-  describe('callAgentExecutor', () => {
-    beforeEach(() => {
-      jest.clearAllMocks();
-    });
-
-    it('kicks off the chain with (only) the last message', async () => {
-      await callAgentExecutor(defaultProps);
-
-      expect(mockCall.mock.calls[0][0].input).toEqual('\n\nDo you know my name?');
-    });
-
-    it('kicks off the chain with the expected message when langChainMessages has only one entry', async () => {
-      const onlyOneMessage = [langChainMessages[0]];
-
-      await callAgentExecutor({
-        ...defaultProps,
-        langChainMessages: onlyOneMessage,
-      });
-      expect(mockCall.mock.calls[0][0].input).toEqual('What is my name?');
-    });
-  });
-
-  describe('OpenAI', () => {
-    describe('when the agent is not streaming', () => {
-      it('creates an instance of ActionsClientChatOpenAI with the expected context from the request', async () => {
-        await callAgentExecutor(defaultProps);
-
-        expect(ActionsClientChatOpenAI).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          streaming: false,
-          temperature: 0.2,
-          llmType: 'openai',
-        });
-      });
-
-      it('uses the openai-functions agent type', async () => {
-        await callAgentExecutor(defaultProps);
-        expect(mockCall.mock.calls[0][0].agentType).toEqual('openai-functions');
-      });
-
-      it('returns the expected response', async () => {
-        const result = await callAgentExecutor(defaultProps);
-
-        expect(result).toEqual({
-          body: {
-            connector_id: 'mock-connector-id',
-            data: mockActionResponse,
-            status: 'ok',
-            replacements: {},
-            trace_data: undefined,
-          },
-          headers: {
-            'content-type': 'application/json',
-          },
-        });
-      });
-    });
-    describe('when the agent is streaming', () => {
-      it('creates an instance of ActionsClientChatOpenAI with the expected context from the request', async () => {
-        await callAgentExecutor({ ...defaultProps, isStream: true });
-
-        expect(ActionsClientChatOpenAI).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          streaming: true,
-          temperature: 0.2,
-          llmType: 'openai',
-        });
-      });
-
-      it('uses the openai-functions agent type', async () => {
-        await callAgentExecutor({ ...defaultProps, isStream: true });
-        expect(mockInvoke.mock.calls[0][0].agentType).toEqual('openai-functions');
-      });
-    });
-  });
-
-  describe('Bedrock', () => {
-    describe('when the agent is not streaming', () => {
-      it('creates an instance of ActionsClientSimpleChatModel with the expected context from the request', async () => {
-        await callAgentExecutor(bedrockProps);
-
-        expect(ActionsClientSimpleChatModel).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          streaming: false,
-          temperature: 0,
-          llmType: 'bedrock',
-        });
-      });
-
-      it('uses the structured-chat-zero-shot-react-description agent type', async () => {
-        await callAgentExecutor(bedrockProps);
-        expect(mockCall.mock.calls[0][0].agentType).toEqual(
-          'structured-chat-zero-shot-react-description'
-        );
-      });
-
-      it('returns the expected response', async () => {
-        const result = await callAgentExecutor(bedrockProps);
-
-        expect(result).toEqual({
-          body: {
-            connector_id: 'mock-connector-id',
-            data: mockActionResponse,
-            status: 'ok',
-            replacements: {},
-            trace_data: undefined,
-          },
-          headers: {
-            'content-type': 'application/json',
-          },
-        });
-      });
-    });
-    describe('when the agent is streaming', () => {
-      it('creates an instance of ActionsClientSimpleChatModel with the expected context from the request', async () => {
-        await callAgentExecutor({ ...bedrockProps, isStream: true });
-
-        expect(ActionsClientSimpleChatModel).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          streaming: true,
-          temperature: 0,
-          llmType: 'bedrock',
-        });
-      });
-
-      it('uses the structured-chat-zero-shot-react-description agent type', async () => {
-        await callAgentExecutor({ ...bedrockProps, isStream: true });
-        expect(mockInvoke.mock.calls[0][0].agentType).toEqual(
-          'structured-chat-zero-shot-react-description'
-        );
-      });
-    });
-  });
-
-  describe('BedrockChat', () => {
-    describe('when the agent is not streaming', () => {
-      it('creates an instance of ActionsClientBedrockChatModel with the expected context from the request', async () => {
-        await callAgentExecutor(bedrockChatProps);
-
-        expect(ActionsClientBedrockChatModel).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          signal: undefined,
-          model: undefined,
-          streaming: false,
-          temperature: 0,
-          llmType: 'bedrock',
-        });
-      });
-
-      it('returns the expected response', async () => {
-        const result = await callAgentExecutor(bedrockChatProps);
-
-        expect(result).toEqual({
-          body: {
-            connector_id: 'mock-connector-id',
-            data: mockActionResponse,
-            status: 'ok',
-            replacements: {},
-            trace_data: undefined,
-          },
-          headers: {
-            'content-type': 'application/json',
-          },
-        });
-      });
-    });
-    describe('when the agent is streaming', () => {
-      it('creates an instance of ActionsClientBedrockChatModel with the expected context from the request', async () => {
-        await callAgentExecutor({ ...bedrockChatProps, isStream: true });
-
-        expect(ActionsClientBedrockChatModel).toHaveBeenCalledWith({
-          actionsClient,
-          connectorId: mockConnectorId,
-          logger: mockLogger,
-          maxRetries: 0,
-          signal: undefined,
-          model: undefined,
-          streaming: true,
-          temperature: 0,
-          llmType: 'bedrock',
-        });
-      });
-    });
-  });
-
-  describe.each([
-    ['OpenAI', defaultProps],
-    ['Bedrock', bedrockProps],
-  ])('Common streaming tests - %s', (_, theProps) => {
-    it('returns the expected response', async () => {
-      const result = await callAgentExecutor({ ...theProps, isStream: true });
-      expect(result.body).toBeInstanceOf(Stream.PassThrough);
-      expect(result.headers).toEqual({
-        'Cache-Control': 'no-cache',
-        Connection: 'keep-alive',
-        'Transfer-Encoding': 'chunked',
-        'X-Accel-Buffering': 'no',
-        'X-Content-Type-Options': 'nosniff',
-      });
-    });
-
-    it('onLlmResponse gets called only after final chain step', async () => {
-      const mockInvokeWithChainCallback = jest.fn().mockImplementation((a, b, c, d, e, f, g) => {
-        b.callbacks[0].handleChainEnd({ output: 'hi' }, '123', '456');
-        b.callbacks[0].handleChainEnd({ output: 'hello' }, '123');
-        return Promise.resolve();
-      });
-      (initializeAgentExecutorWithOptions as jest.Mock).mockImplementation(
-        (_a, _b, { agentType }) => ({
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          call: (props: any, more: any) => mockCall({ ...props, agentType }, more),
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          invoke: (props: any, more: any) =>
-            mockInvokeWithChainCallback({ ...props, agentType }, more),
-        })
-      );
-      const onLlmResponse = jest.fn(async () => {}); // We need it to be a promise, or it'll crash because of missing `.catch`
-      await callAgentExecutor({ ...theProps, onLlmResponse, isStream: true });
-
-      expect(onLlmResponse).toHaveBeenCalledWith(
-        'hello',
-        {
-          traceId: undefined,
-          transactionId: undefined,
-        },
-        false
-      );
-    });
-
-    it('does not streams token after handleStreamEnd has been called', async () => {
-      const mockInvokeWithChainCallback = jest.fn().mockImplementation((a, b, c, d, e, f, g) => {
-        b.callbacks[0].handleLLMNewToken('hi', {}, '123', '456');
-        b.callbacks[0].handleChainEnd({ output: 'hello' }, '123');
-        b.callbacks[0].handleLLMNewToken('hey', {}, '678', '456');
-        return Promise.resolve();
-      });
-      (initializeAgentExecutorWithOptions as jest.Mock).mockImplementation(
-        (_a, _b, { agentType }) => ({
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          call: (props: any, more: any) => mockCall({ ...props, agentType }, more),
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          invoke: (props: any, more: any) =>
-            mockInvokeWithChainCallback({ ...props, agentType }, more),
-        })
-      );
-      const onLlmResponse = jest.fn(async () => {}); // We need it to be a promise, or it'll crash because of missing `.catch`
-      await callAgentExecutor({ ...theProps, onLlmResponse, isStream: true });
-
-      expect(mockPush).toHaveBeenCalledWith({ payload: 'hi', type: 'content' });
-      expect(mockPush).not.toHaveBeenCalledWith({ payload: 'hey', type: 'content' });
-    });
-
-    it('only streams tokens with length from the root parentRunId', async () => {
-      const mockInvokeWithChainCallback = jest.fn().mockImplementation((a, b, c, d, e, f, g) => {
-        b.callbacks[0].handleLLMNewToken('', {}, '123', '456');
-
-        b.callbacks[0].handleLLMNewToken('hi', {}, '123', '456');
-        b.callbacks[0].handleLLMNewToken('hello', {}, '555', '666');
-        b.callbacks[0].handleLLMNewToken('hey', {}, '678', '456');
-        return Promise.resolve();
-      });
-      (initializeAgentExecutorWithOptions as jest.Mock).mockImplementation(
-        (_a, _b, { agentType }) => ({
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          call: (props: any, more: any) => mockCall({ ...props, agentType }, more),
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          invoke: (props: any, more: any) =>
-            mockInvokeWithChainCallback({ ...props, agentType }, more),
-        })
-      );
-      const onLlmResponse = jest.fn();
-      await callAgentExecutor({ ...theProps, onLlmResponse, isStream: true });
-
-      expect(mockPush).toHaveBeenCalledWith({ payload: 'hi', type: 'content' });
-      expect(mockPush).toHaveBeenCalledWith({ payload: 'hey', type: 'content' });
-      expect(mockPush).not.toHaveBeenCalledWith({ payload: 'hello', type: 'content' });
-      expect(mockPush).not.toHaveBeenCalledWith({ payload: '', type: 'content' });
-    });
-  });
-});
diff --git a/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/index.ts b/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/index.ts
deleted file mode 100644
index 5103135310eb3..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/index.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { AgentExecutor } from './types';
-import { callAgentExecutor } from '../execute_custom_llm_chain';
-import { callOpenAIFunctionsExecutor } from './openai_functions_executor';
-
-/**
- * To support additional Agent Executors from the UI, add them to this map
- * and reference your specific AgentExecutor function
- */
-export const AGENT_EXECUTOR_MAP: Record<string, AgentExecutor<false>> = {
-  DefaultAgentExecutor: callAgentExecutor,
-  OpenAIFunctionsExecutor: callOpenAIFunctionsExecutor,
-};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/openai_functions_executor.ts b/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/openai_functions_executor.ts
deleted file mode 100644
index edea22a888dff..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/langchain/executors/openai_functions_executor.ts
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { initializeAgentExecutorWithOptions } from 'langchain/agents';
-import { RetrievalQAChain } from 'langchain/chains';
-import { BufferMemory, ChatMessageHistory } from 'langchain/memory';
-import { ChainTool } from 'langchain/tools/chain';
-
-import { ActionsClientLlm } from '@kbn/langchain/server';
-import { APMTracer } from '@kbn/langchain/server/tracers/apm';
-import { withAssistantSpan } from '../tracers/apm/with_assistant_span';
-import { AgentExecutor } from './types';
-
-export const OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID =
-  'Elastic AI Assistant Agent Executor (OpenAI Functions)';
-
-/**
- * This is an agent executor to be used with the model evaluation API for benchmarking.
- * Currently just a copy of `callAgentExecutor`, but using the `openai-functions` agent type.
- *
- * NOTE: This is not to be used in production as-is, and must be used with an OpenAI ConnectorId
- */
-export const callOpenAIFunctionsExecutor: AgentExecutor<false> = async ({
-  actionsClient,
-  connectorId,
-  esClient,
-  esStore,
-  langChainMessages,
-  llmType,
-  logger,
-  request,
-  traceOptions,
-}) => {
-  const llm = new ActionsClientLlm({
-    actionsClient,
-    connectorId,
-    llmType,
-    logger,
-    model: request.body.model,
-  });
-
-  const pastMessages = langChainMessages.slice(0, -1); // all but the last message
-  const latestMessage = langChainMessages.slice(-1); // the last message
-
-  const memory = new BufferMemory({
-    chatHistory: new ChatMessageHistory(pastMessages),
-    memoryKey: 'chat_history', // this is the key expected by https://github.com/langchain-ai/langchainjs/blob/a13a8969345b0f149c1ca4a120d63508b06c52a5/langchain/src/agents/initialize.ts#L166
-    inputKey: 'input',
-    outputKey: 'output',
-    returnMessages: true,
-  });
-
-  const modelExists = await esStore.isModelInstalled();
-  if (!modelExists) {
-    throw new Error(
-      'Please ensure ELSER is configured to use the Knowledge Base, otherwise disable the Knowledge Base in Advanced Settings to continue.'
-    );
-  }
-
-  // Create a chain that uses the ELSER backed ElasticsearchStore, override k=10 for esql query generation for now
-  const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever(10));
-
-  // TODO: Dependency inject these tools
-  const tools = [
-    new ChainTool({
-      name: 'ESQLKnowledgeBaseTool',
-      description:
-        'Call this for knowledge on how to build an ESQL query, or answer questions about the ES|QL query language.',
-      chain,
-      tags: ['esql', 'query-generation', 'knowledge-base'],
-    }),
-  ];
-
-  const executor = await initializeAgentExecutorWithOptions(tools, llm, {
-    agentType: 'openai-functions',
-    memory,
-    verbose: false,
-  });
-
-  // Sets up tracer for tracing executions to APM. See x-pack/plugins/elastic_assistant/server/lib/langchain/tracers/README.mdx
-  // If LangSmith env vars are set, executions will be traced there as well. See https://docs.smith.langchain.com/tracing
-  const apmTracer = new APMTracer({ projectName: traceOptions?.projectName ?? 'default' }, logger);
-
-  let traceData;
-
-  // Wrap executor call with an APM span for instrumentation
-  const langChainResponse = await withAssistantSpan(
-    OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID,
-    async (span) => {
-      if (span?.transaction?.ids['transaction.id'] != null && span?.ids['trace.id'] != null) {
-        traceData = {
-          // Transactions ID since this span is the parent
-          transaction_id: span.transaction.ids['transaction.id'],
-          trace_id: span.ids['trace.id'],
-        };
-        span.addLabels({ evaluationId: traceOptions?.evaluationId });
-      }
-
-      return executor.call(
-        { input: latestMessage[0].content },
-        {
-          callbacks: [apmTracer, ...(traceOptions?.tracers ?? [])],
-          runName: OPEN_AI_FUNCTIONS_AGENT_EXECUTOR_ID,
-          tags: traceOptions?.tags ?? [],
-        }
-      );
-    }
-  );
-
-  return {
-    body: {
-      connector_id: connectorId,
-      data: langChainResponse.output, // the response from the actions framework
-      trace_data: traceData,
-      status: 'ok',
-    },
-    headers: {
-      'content-type': 'application/json',
-    },
-    connector_id: connectorId,
-    data: langChainResponse.output, // the response from the actions framework
-    trace_data: traceData,
-    status: 'ok',
-  };
-};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/default_assistant_graph/graph.ts b/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/default_assistant_graph/graph.ts
index ab244837475c7..2708c3b9c5702 100644
--- a/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/default_assistant_graph/graph.ts
+++ b/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/default_assistant_graph/graph.ts
@@ -37,7 +37,7 @@ import { RESPOND_NODE, respond } from './nodes/respond';
 
 export const DEFAULT_ASSISTANT_GRAPH_ID = 'Default Security Assistant Graph';
 
-interface GetDefaultAssistantGraphParams {
+export interface GetDefaultAssistantGraphParams {
   agentRunnable: AgentRunnableSequence;
   dataClients?: AssistantDataClients;
   conversationId?: string;
diff --git a/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/index.ts b/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/index.ts
new file mode 100644
index 0000000000000..706da7197f31a
--- /dev/null
+++ b/x-pack/plugins/elastic_assistant/server/lib/langchain/graphs/index.ts
@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import {
+  getDefaultAssistantGraph,
+  GetDefaultAssistantGraphParams,
+  DefaultAssistantGraph,
+} from './default_assistant_graph/graph';
+
+export type GetAssistantGraph = (params: GetDefaultAssistantGraphParams) => DefaultAssistantGraph;
+
+/**
+ * Registry of Assistant Graph factories, keyed by graph name. Useful for running evaluations.
+ */
+export const ASSISTANT_GRAPH_MAP: Record<string, GetAssistantGraph> = {
+  DefaultAssistantGraph: getDefaultAssistantGraph,
+  // TODO: Support additional graphs
+  // AttackDiscoveryGraph: getDefaultAssistantGraph, // placeholder: reuses the default getter
+};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/esql_dataset.json b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/esql_dataset.json
deleted file mode 100644
index 69305e7bcf44c..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/esql_dataset.json
+++ /dev/null
@@ -1,126 +0,0 @@
-[
-  {
-    "input": "Generate an ES|QL query that will count the number of connections made to external IP addresses, broken down by user. If the count is greater than 100 for a specific user, add a new field called \"follow_up\" that contains a value of \"true\", otherwise, it should contain \"false\". The user names should also be enriched with their respective group names.",
-    "reference": "FROM logs-*\n| WHERE NOT CIDR_MATCH(destination.ip, \"10.0.0.0/8\", \"172.16.0.0/12\", \"192.168.0.0/16\")\n| STATS destcount = COUNT(destination.ip) by user.name, host.name\n| ENRICH ldap_lookup_new ON user.name\n| WHERE group.name IS NOT NULL\n| EVAL follow_up = CASE(\n    destcount >= 100, \"true\",\n     \"false\")\n| SORT destcount desc\n| KEEP destcount, host.name, user.name, group.name, follow_up"
-  },
-  {
-    "input": "Generate an ES|QL query that will parse the DNS registered domain from a DNS query, count the number distinct DNS queries being made per DNS registered domain and filter for when the distinct count is greater than 5. The query should sort the results by the distinct count of queries in descending order.",
-    "reference": "from logs-*\n| grok dns.question.name \"%{DATA}\\\\.%{GREEDYDATA:dns.question.registered_domain:string}\"\n| stats unique_queries = count_distinct(dns.question.name) by dns.question.registered_domain, process.name\n| where unique_queries > 5\n| sort unique_queries desc"
-  },
-  {
-    "input": "Generate an ES|QL query that will filter all logs for those containing windows event codes, count them by host name, and enrich the codes with a description of what the code means (via an enrichment policy called \"win_events\"). The results should be sorted by the count of each code in descending order",
-    "reference": "from logs-*\n| where event.code is not null\n| stats event_code_count = count(event.code) by event.code,host.name\n| enrich win_events on event.code with EVENT_DESCRIPTION\n| where EVENT_DESCRIPTION is not null and host.name is not null\n| rename EVENT_DESCRIPTION as event.description\n| sort event_code_count desc\n| keep event_code_count,event.code,host.name,event.description\""
-  },
-  {
-    "input": "Generate an ES|QL query that will filter for file creation events. Count the number of file interactions by the process interacting with the file, and the host name. From the process name field, parse and output two new fields that represent the process and the process extension separately. Calculate the length of the process name and filter for events where the length is greater than 15 characters. Sort the result based on the process length and filecount in descending order. Limit the results to the top 10.",
-    "reference": "from logs-*\n| where event.category == \"file\" and event.action == \"creation\"\n| stats filecount = count(file.name) by process.name,host.name\n| dissect process.name \"%{process}.%{extension}\" \n| eval proclength = length(process.name)\n| where proclength > 10 \n| sort filecount,proclength desc\n| limit 10 \n| keep host.name,process.name,filecount,process,extension,fullproc,proclength"
-  },
-  {
-    "input": "Generate an ES|QL query that will look for all process events for the process \"curl.exe\". Calculate the sum of outbund bytes for this process by the destination address. Output the results in KB, also sorted by KB in descending order. Limit to the top 10 results.",
-    "reference": "from logs-*\n| where process.name == \"curl.exe\"\n| stats bytes = sum(destination.bytes) by destination.address\n| eval kb =  bytes/1024\n| sort kb desc\n| limit 10\n| keep kb,destination.address"
-  },
-  {
-    "input": "I want to see a query for metrics-apm*, filtering on metricset.name:transaction and metricset.interval:1m, showing the average duration (via transaction.duration.histogram), in 50 buckets.",
-    "reference": "FROM metrics-apm*\n| WHERE metricset.name == \"transaction\" AND metricset.interval == \"1m\"\n| EVAL bucket = AUTO_BUCKET(transaction.duration.histogram, 50, <start-date>, <end-date>)\n| STATS avg_duration = AVG(transaction.duration.histogram) BY bucket"
-  },
-  {
-    "input": "For standard Elastic ECS compliant packetbeat data view, create an ES|QL query that shows the top 10 unique domains by doc count",
-    "reference": "FROM packetbeat-*\n| STATS doc_count = COUNT(destination.domain) BY destination.domain\n| SORT doc_count DESC\n| LIMIT 10"
-  },
-  {
-    "input": "From employees, I want to see the 5 earliest employees (hire_date), I want to display only the month and the year that they were hired in and their employee number (emp_no). Format the date as e.g. \"September 2019\". Only show the query",
-    "reference": "FROM employees\n| EVAL hire_date_formatted = DATE_FORMAT(hire_date, \"MMMM yyyy\")\n| SORT hire_date\n| KEEP emp_no, hire_date_formatted\n| LIMIT 5"
-  },
-  {
-    "input": "From employees, I want to sort the documents by salary, and then return 10 results per page, and then see the second page",
-    "reference": "Pagination is not supported"
-  },
-  {
-    "input": "My logs data (ECS) is in `logs-*`. Show me a query that gets the average CPU per host, limit it to the top 10 results, in 1m buckets, and only include the last 15m.",
-    "reference": "FROM logs-*\n| WHERE @timestamp >= NOW() - 15 minutes\n| EVAL bucket = DATE_TRUNC(1 minute, @timestamp)\n| STATS avg_cpu = AVG(system.cpu.total.norm.pct) BY bucket, host.name\n| LIMIT 10"
-  },
-  {
-    "input": "I want to show a list of services with APM data. My data is in `traces-apm*`. I want to show the average transaction duration, the success rate (by dividing event.outcome:failure by event.outcome:failure+success), and total amount of requests. As a time range, select the last 24 hours. Just show me the query.",
-    "reference": "FROM traces-apm*\n| WHERE @timestamp >= NOW() - 24 hours\n| EVAL successful = CASE(event.outcome == \"success\", 1, 0),\n  failed = CASE(event.outcome == \"failure\", 1, 0)\n| STATS success_rate = AVG(successful), \n  avg_duration = AVG(transaction.duration), \n  total_requests = COUNT(transaction.id) BY service.name"
-  },
-  {
-    "input": "from `metricbeat*`, I want to see the percentage of CPU time normalized by the number of CPU cores, broken down by hostname. the fields are system.cpu.user.pct, system.cpu.system.pct, and system.cpu.cores. just show me the query",
-    "reference": "FROM metricbeat*\n| EVAL cpu_pct_normalized = (system.cpu.user.pct + system.cpu.system.pct) / system.cpu.cores\n| STATS AVG(cpu_pct_normalized) BY host.name"
-  },
-  {
-    "input": "I want to see a query that does the following: extract the query duration from postgres log messages, and calculate the avg",
-    "reference": "FROM postgres-logs\n| DISSECT message \"%{} duration: %{query_duration} ms\"\n| EVAL query_duration_num = TO_DOUBLE(query_duration)\n| STATS avg_duration = AVG(query_duration_num)"
-  },
-  {
-    "input": "From `nyc_taxis`, give me the top 10 results where the drop off time was between 6am and 10am. Just give me the query.",
-    "reference": "FROM nyc_taxis\n| WHERE DATE_EXTRACT(drop_off_time, \"hour\") >= 6 AND DATE_EXTRACT(drop_off_time, \"hour\") < 10\n| LIMIT 10"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nDetecting Failed Logins from a Single IP",
-    "reference": "FROM logs-*\n| WHERE event.action == \"failed_login\" \n| STATS login_counts = COUNT(event.action) by source.ip\n| WHERE login_counts > 5"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nDetecting Large Data Transfers\n\n",
-    "reference": "FROM logs-*\n| WHERE network.bytes > 1000000\n| KEEP source.ip, destination.ip, network.bytes"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Rare User Agents\n\n\n\n",
-    "reference": "FROM logs-*\n| STATS user_agent_count = COUNT(user_agent.original) by user_agent.original\n| WHERE user_agent_count < 5\n| KEEP user_agent.original"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nDetecting Potential Beaconing Activity\n\n\n\n",
-    "reference": "FROM logs-*\n| STATS domain_requests = COUNT(url.domain) by source.ip, domain\n| WHERE domain_requests > 100\n| KEEP source.ip, url.domain"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Uncommon Processes",
-    "reference": "FROM logs-*\n| STATS process_count = COUNT(process.name) by process.name\n| WHERE process_count < 3\n| KEEP process.name"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nLocating Suspicious Outbound Connections",
-    "reference": "FROM logs-*\n| WHERE destination.port NOT IN (80, 443) and direction == \"outbound\"\n| KEEP source.ip, destination.ip, destination.port"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Processes Running from Temporary Directories",
-    "reference": "FROM logs-*\n| WHERE process.working_directory RLIKE \"/tmp.*\"\n| KEEP process.name, process.working_directory"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nDetecting Connections to Non-Standard SSH Ports\n\n",
-    "reference": "FROM logs-*\n| WHERE destination.port NOT IN (22) AND process.name == \"ssh\"\n| KEEP source.ip, destination.ip, destination.port"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Potential Phishing Domains\n\n",
-    "reference": "FROM logs-*\n| WHERE url.domain RLIKE \".*paypa1.*|.*banking.*\"\n| KEEP source.ip, url.domain"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nLocating Suspicious Concatenated Strings in Logs (Possible Script or Malware)",
-    "reference": "FROM logs-*\n| WHERE CONCAT(\"evil\", \"payload\") IN log.message\n| KEEP log.message"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nUncovering Connections to Non-Standard SSH Ports Outside of Local Network",
-    "reference": "FROM logs-*\n| WHERE NOT CIDR_MATCH(destination.ip, \"10.0.0.0/8\", \"192.168.0.0/16\")\n| KEEP source.ip, destination.ip, network.bytes"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nUncovering Connections to Non-Standard SSH Ports Outside of Local Network",
-    "reference": "FROM logs-*\n| WHERE CIDR_MATCH(ip, \"10.0.0.0/8\", \"192.168.0.0/16\")\n| DISSECT user_agent \"%{browser_name}/%{browser_version} (%{os_name}; %{os_version})\""
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Potential PowerShell Exploits\n\n",
-    "reference": "FROM logs-*\n| WHERE process.name == \"powershell.exe\" and process.command_line RLIKE \".*EncodedCommand.*\"\n| KEEP host.name, user.name, command_line, @timestamp"
-  },
-  {
-    "input": "Generate an ES|QL query that will :\nCategorize Data Transfer Sizes\n\n",
-    "reference": "FROM logs-*\n| EVAL transfer_category = CASE(\n    bytes_transferred < 1000, \"Small\",\n    bytes_transferred < 100000, \"Medium\",\n    \"Large\")\n| STATS transfer_count = COUNT(transfer_category) by transfer_category\n| KEEP transfer_category, transfer_count"
-  },
-  {
-    "input": "Generate an ES|QL query that will:\nCategorizing Failed Logins by Reason",
-    "reference": "FROM logs-*\n| WHERE event.action == \"failed_login\"\n| EVAL failure_reason = CASE(\n    error_code == \"404\", \"Not Found\",\n    error_code == \"403\", \"Forbidden\",\n    \"Other\")\n| STATS failure_count = COUNT(failure_reason) by failure_reason\n| KEEP failure_reason, failure_count"
-  },
-  {
-    "input": "Generate an ES|QL query that will search for the following use case: \nIdentifying Distinct IP counts Accessing Rare Endpoints",
-    "reference": "FROM logs-*\n| WHERE url.path RLIKE \"/admin/*\"\n| STATS unique_count = COUNT_DISTINCT(source.ip) by source.ip\n| KEEP unique_count, source.ip"
-  },
-  {
-    "input": "Generate an ES|QL query that will:\nIdentify Data Exfiltration by Monitoring Outgoing Data Size",
-    "reference": "FROM logs-*\n| WHERE network.direction == \"outbound\"\n| STATS data_size = SUM(bytes_transferred) by destination.ip\n| WHERE data_size > 1000000\n| KEEP destination.ip, data_size"
-  }
-]
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/query_dataset.json b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/query_dataset.json
deleted file mode 100644
index ed61d58da4ee5..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/query_dataset.json
+++ /dev/null
@@ -1,38 +0,0 @@
-[
-  {
-    "input": "Generate an EQL Query to detect data exfiltration attempts on linux systems. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "This query looks for a sequence of events, first for a shell process (e.g., bash, zsh, etc.) execution, and then for an outbound network connection from that process within a specified list of ports commonly used for data exfiltration. \n\nsequence by process.entity_id\n  [process\n    where process.name : (\"bash\", \"sh\", \"zsh\", \"dash\", \"ksh\", \"tcsh\", \"fish\", \"csh\", \"pwsh\")\n  ]\n  [network\n    where network.direction == \"outbound\" and\n    destination.port : (20, 21, 22, 25, 80, 110, 143, 443, 465, 587, 993, 995, 3389, 5601)\n  ]"
-  },
-  {
-    "input": "Generate an EQL query which will allow me to detect brute force attempts against my web servers.The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "This query looks for a sequence of events where three consecutive failed authentication attempts from the same source IP are followed by a successful login within a 5-minute span.\n\n sequence by source.ip with maxspan=5m\n  [ authentication where event.outcome == \"failure\" ] with runs=3\n  [ authentication where event.outcome == \"success\" ]\n"
-  },
-  {
-    "input": "Generate an EQL query that will detect suspicious process activity from Microsoft office programs. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Identifies suspicious child processes of frequently targeted Microsoft Office applications (Word, PowerPoint, Excel). These child processes are often launched during exploitation of Office applications or from documents with malicious macros.\n\nprocess where host.os.type == \"windows\" and event.type == \"start\" and\n  process.parent.name : (\"eqnedt32.exe\", \"excel.exe\", \"fltldr.exe\", \"msaccess.exe\", \"mspub.exe\", \"powerpnt.exe\", \"winword.exe\", \"outlook.exe\") and\n  process.name : (\"Microsoft.Workflow.Compiler.exe\", \"arp.exe\", \"atbroker.exe\", \"bginfo.exe\", \"bitsadmin.exe\", \"cdb.exe\", \"certutil.exe\",\n                \"cmd.exe\", \"cmstp.exe\", \"control.exe\", \"cscript.exe\", \"csi.exe\", \"dnx.exe\", \"dsget.exe\", \"dsquery.exe\", \"forfiles.exe\",\n                \"fsi.exe\", \"ftp.exe\", \"gpresult.exe\", \"hostname.exe\", \"ieexec.exe\", \"iexpress.exe\", \"installutil.exe\", \"ipconfig.exe\",\n                \"mshta.exe\", \"msxsl.exe\", \"nbtstat.exe\", \"net.exe\", \"net1.exe\", \"netsh.exe\", \"netstat.exe\", \"nltest.exe\", \"odbcconf.exe\",\n                \"ping.exe\", \"powershell.exe\", \"pwsh.exe\", \"qprocess.exe\", \"quser.exe\", \"qwinsta.exe\", \"rcsi.exe\", \"reg.exe\", \"regasm.exe\",\n                \"regsvcs.exe\", \"regsvr32.exe\", \"sc.exe\", \"schtasks.exe\", \"systeminfo.exe\", \"tasklist.exe\", \"tracert.exe\", \"whoami.exe\",\n                \"wmic.exe\", \"wscript.exe\", \"xwizard.exe\", \"explorer.exe\", \"rundll32.exe\", \"hh.exe\", \"msdt.exe\")"
-  },
-  {
-    "input": "Generate an EQL query that will detect the enumeration of files and directories using built-in tools. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Enumeration of files and directories using built-in tools. Adversaries may use the information discovered to plan follow-on activity.\n\n\nsequence by agent.id, user.name with maxspan=1m\n[process where event.type in (\"start\", \"process_started\") and\n  ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n    process.name : \"tree.com\"]\n[process where event.type in (\"start\", \"process_started\") and\n  ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n    process.name : \"tree.com\"]\n[process where event.type in (\"start\", \"process_started\") and\n  ((process.name : \"cmd.exe\" or process.pe.original_file_name == \"Cmd.Exe\") and process.args : \"dir\") or\n    process.name : \"tree.com\"]"
-  },
-  {
-    "input": "Generate an EQL query that will detect unusual child proceses of RunDLL32. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Identifies child processes of unusual instances of RunDLL32 where the command line parameters were suspicious. Misuse of RunDLL32 could indicate malicious activity.\n\nsequence with maxspan=1h\n  [process where host.os.type == \"windows\" and event.type == \"start\" and\n     (process.name : \"rundll32.exe\" or process.pe.original_file_name == \"RUNDLL32.EXE\") and\n      process.args_count == 1\n  ] by process.entity_id\n  [process where host.os.type == \"windows\" and event.type == \"start\" and process.parent.name : \"rundll32.exe\"\n  ] by process.parent.entity_id"
-  },
-  {
-    "input": "Generate an EQL query that will detect Multiple Logon Failures Followed by Logon Success. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Identifies multiple logon failures followed by a successful one from the same source address. Adversaries will often brute force login attempts across multiple users with a common or known password, in an attempt to gain access to accounts.\n\nsequence by winlog.computer_name, source.ip with maxspan=5s\n  [authentication where event.action == \"logon-failed\" and\n    /* event 4625 need to be logged */\n    winlog.logon.type : \"Network\" and\n    source.ip != null and source.ip != \"127.0.0.1\" and source.ip != \"::1\" and\n    not user.name : (\"ANONYMOUS LOGON\", \"-\", \"*$\") and not user.domain == \"NT AUTHORITY\" and\n\n    /* noisy failure status codes often associated to authentication misconfiguration */\n    not winlog.event_data.Status : (\"0xC000015B\", \"0XC000005E\", \"0XC0000133\", \"0XC0000192\")] with runs=5\n  [authentication where event.action == \"logged-in\" and\n    /* event 4624 need to be logged */\n    winlog.logon.type : \"Network\" and\n    source.ip != null and source.ip != \"127.0.0.1\" and source.ip != \"::1\" and\n    not user.name : (\"ANONYMOUS LOGON\", \"-\", \"*$\") and not user.domain == \"NT AUTHORITY\"]"
-  },
-  {
-    "input": "Generate an EQL query that will detect potential sudo hijacking. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Identifies the creation of a sudo binary located at /usr/bin/sudo. Attackers may hijack the default sudo binary and replace it with a custom binary or script that can read the user's password in clear text to escalate privileges or enable persistence onto the system every time the sudo binary is executed.\n\nfile where event.type in (\"creation\", \"file_create_event\") and file.path == \"/usr/bin/sudo\""
-  },
-  {
-    "input": "Generate an EQL query that will detect Tampering of Bash Command-Line History. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Adversaries may attempt to clear or disable the Bash command-line history in an attempt to evade detection or forensic investigations.\n\nprocess where event.type in (\"start\", \"process_started\") and\n (\n  ((process.args : (\"rm\", \"echo\") or\n    (process.args : \"ln\" and process.args : \"-sf\" and process.args : \"/dev/null\") or\n    (process.args : \"truncate\" and process.args : \"-s0\"))\n    and process.args : (\".bash_history\", \"/root/.bash_history\", \"/home/*/.bash_history\",\"/Users/.bash_history\", \"/Users/*/.bash_history\",\n                        \".zsh_history\", \"/root/.zsh_history\", \"/home/*/.zsh_history\", \"/Users/.zsh_history\", \"/Users/*/.zsh_history\")) or\n  (process.name : \"history\" and process.args : \"-c\") or\n  (process.args : \"export\" and process.args : (\"HISTFILE=/dev/null\", \"HISTFILESIZE=0\")) or\n  (process.args : \"unset\" and process.args : \"HISTFILE\") or\n  (process.args : \"set\" and process.args : \"history\" and process.args : \"+o\")\n )"
-  },
-  {
-    "input": "Generate an EQL query that will detect an Attempt to Remove File Quarantine Attribute on macOS. The query should be immediately usable in an Elastic security timeline or detection rule.",
-    "reference": "Identifies a potential Gatekeeper bypass. In macOS, when applications or programs are downloaded from the internet, there is a quarantine flag set on the file. This attribute is read by Apple's Gatekeeper defense program at execution time. An adversary may disable this attribute to evade defenses.\n\n\nprocess where host.os.type == \"macos\" and event.type in (\"start\", \"process_started\") and\n  process.name : \"xattr\" and\n  (\n    (process.args : \"com.apple.quarantine\" and process.args : (\"-d\", \"-w\")) or\n    (process.args : \"-c\") or\n    (process.command_line : (\"/bin/bash -c xattr -c *\", \"/bin/zsh -c xattr -c *\", \"/bin/sh -c xattr -c *\"))\n  ) and not process.args_count > 12"
-  }
-]
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_labs.json b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_labs.json
deleted file mode 100644
index c8ac5ec490a77..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_labs.json
+++ /dev/null
@@ -1,38 +0,0 @@
-[
-  {
-    "input": "What is BLISTER malware?",
-    "reference": "BLISTER, a malware loader initially discovered by Elastic Security Labs in 2021 and associated with financially-motivated intrusions, is a testament to this trend as it continues to develop additional capabilities. Two years after its initial discovery, BLISTER continues to receive updates while flying under the radar, gaining momentum as an emerging threat. Recent findings from Palo Alto’s Unit 42 describe an updated SOCGHOLISH infection chain used to distribute BLISTER and deploy a payload from MYTHIC, an open-source Command and Control (C2) framework.\n\nView the full research on Elastic Security Labs"
-  },
-  {
-    "input": "What detection rules does Elastic Security Labs provide for ICEDID?",
-    "reference": "ICEDID is a malware family first described in 2017 by IBM X-force researchers and is associated with the theft of login credentials, banking information, and other personal information. ICEDID has always been a prevalent family, but has achieved even more growth since EMOTET’s temporary disruption in early 2021. ICEDID has been linked to the distribution of other distinct malware families including DarkVNC and COBALT STRIKE. Regular industry reporting, including research publications like this one, help mitigate this threat.\n\nDetections and preventions\nDetection logic\nEnumeration of Administrator Accounts\nCommand Shell Activity Started via RunDLL32\nSecurity Software Discovery using WMIC\nSuspicious Execution from a Mounted Device\nWindows Network Enumeration\nUnusual DLL Extension Loaded by Rundll32 or Regsvr32\nSuspicious Windows Script Interpreter Child Process\nRunDLL32 with Unusual Arguments\nPreventions (source: https://github.com/elastic/protections-artifacts/)\nMalicious Behavior Detection Alert: Command Shell Activity\nMemory Threat Detection Alert: Shellcode Injection\nMalicious Behavior Detection Alert: Unusual DLL Extension Loaded by Rundll32 or Regsvr32\nMalicious Behavior Detection Alert: Suspicious Windows Script Interpreter Child Process\nMalicious Behavior Detection Alert: RunDLL32 with Unusual Arguments\nMalicious Behavior Detection Alert: Windows Script Execution from Archive File\nYARA\nElastic Security has created multiple YARA rules related to the different stages/components within ICEDID infection, these can be found in the signature linked below: - Windows.Trojan.ICEDID\n\nMore information can be found in this Elastic Security Labs post."
-  },
-  {
-    "input": "Can you provide a code analysis breakdown of the SPECTRALVIPER malware variant?",
-    "reference": "SPECTRALVIPER code analysis\nOverview\nDuring our investigation, we observed a previously-undiscovered backdoor malware family that we’re naming SPECTRALVIPER. SPECTRALVIPER is a 64-bit Windows backdoor coded in C++ and heavily obfuscated. It operates with two distinct communication modes, allowing it to receive messages either via HTTP or a Windows named pipe.\n\nThrough our analysis, we have identified the following capabilities:\n\nPE loading/Injection : SPECTRALVIPER can load and inject executable files, supporting both x86 and x64 architectures. This capability enables it to execute malicious code within legitimate processes.\nToken Impersonation : The malware possesses the ability to impersonate security tokens, granting it elevated privileges and bypassing certain security measures. This enables unauthorized access and manipulation of sensitive resources.\nFile downloading/uploading : SPECTRALVIPER can download and upload files to and from the compromised system. This allows the attacker to exfiltrate data or deliver additional malicious payloads to the infected machine.\nFile/directory manipulation : The backdoor is capable of manipulating files and directories on the compromised system. This includes creating, deleting, modifying, and moving files or directories, providing the attacker with extensive control over the victim's file system.\n\nFor more information, visit the Elastic Security Labs post."
-  },
-  {
-    "input": "Can you describe the threat that PHOREAL malware presents to an organization?",
-    "reference": "PHOREAL/RIZZO is a backdoor allowing initial victim characterization and follow-on post-exploitation operations to compromise the confidentiality of organizations’ data. It has been reported in other research as being used exclusively by APT32 (AKA SeaLotus, OceanLotus, APT-C-00, Group G0050).\n\nFor more information, see this Elastic Security Labs post."
-  },
-  {
-    "input": "Can you give an example of ransomware that attempts to wipe a host master boot record?",
-    "reference": "One such example observed by Elastic Security Labs is WhisperGate, part of the Bleeding Bear malware campaign. Ths SHA256 hash of this sample is a196c6b8ffcb97ffb276d04f354696e2391311db3841ae16c8c9f56f36a38e92. For more information, please view the detailed post on Elastic Security Labs."
-  },
-  {
-    "input": "How does Elastic Security Labs use kernel call stacks to detect threats living in memory?",
-    "reference": "With Elastic Security 8.8, Elastic Security Labs added new kernel call stack based detections which provide improved efficacy against in-memory threats. A call stack is the ordered sequence of functions that are executed to achieve a behavior of a program. It shows in detail which functions (and their associated modules) were executed to lead to a behavior like a new file or process being created. Knowing a behavior’s call stack, we can build detections with detailed contextual information about what a program is doing and how it’s doing it. The new call stack based detection capability leverages our existing deep in-line kernel visibility for the most common system behaviors (process, file, registry, library, etc). With each event, we capture the call stack for the activity. This is later enriched with module information, symbols, and evidence of suspicious activity. This gives Elastic Defend procmon-like visibility in real-time, powering advanced preventions for in-memory tradecraft.\n\nView this post for more information."
-  },
-  {
-    "input": "What were some of Elastic Security Labs key findings in their 2023 Global Threat Report?",
-    "reference": "Impairing defenses by tampering with cloud logging functionality was one of the most common techniques observed in the later part of 2022 and continues into 2023 - This likely impacted visibility of other techniques due to missing data sources, and is potentially a reaction to improvements in cloud logging - XMRig prevalence exploded on MacOS, likely as a result of macroeconomic conditions. The full 2023 Spring report can be found here."
-  },
-  {
-    "input": "Is Elastic Security Labs tracking any malware that targets macOS systems?",
-    "reference": "Yes, one such malware variant is RUSTBUCKET. RUSTBUCKET adds persistence capabilities not previously observed and, at the time of reporting, is undetected by VirusTotal signature engines. Elastic Defend behavioral and prebuilt detection rules provide protection and visibility for users. We have also released a signature to prevent this malware execution.\n\nThe research into REF9135 used host, binary, and network analysis to identify and attribute intrusions observed by this research team, and other intelligence groups, with high confidence to the Lazarus Group; a cybercrime and espionage organization operated by the Democratic People’s Republic of North Korea (DPRK).\n\nView this post for the full research article."
-  },
-  {
-    "input": "How does Elastic Defend help agaist threats like BPFDoor?",
-    "reference": "BPFDoor is a backdoor payload specifically crafted for Linux. Its purpose is for long-term persistence in order to gain re-entry into a previously or actively compromised target environment. It notably utilizes BPF along with a number of other techniques to achieve this goal, taking great care to be as efficient and stealthy as possible. \n\nElastic Security Labs has several detections that can detect and prevent BPDoor. They have also published Yara signatures for it.\n\nThe following Elastic Detection Rules will identify BPFDoor activity:\n\nAbnormal Process ID or Lock File Created\nBinary Executed from Shared Memory Directory\n\nThe yara signature can be found here.\n\nMore details on BPFDoor can be found in this Elastic Security Labs post."
-  }
-]
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_questions_dataset.json b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_questions_dataset.json
deleted file mode 100644
index aaa2494018b0a..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/datasets/security_questions_dataset.json
+++ /dev/null
@@ -1,54 +0,0 @@
-[
-  {
-    "input": "How do I install Elastic Agent to collect events from my windows systems?",
-    "reference": "To install an Elastic Agent and enroll it in Fleet:\n\nIn Kibana, go to Fleet > Agents, and click Add agent.\nIn the Add agent flyout, select an existing agent policy or create a new one. If you create a new policy, Fleet generates a new Fleet enrollment token.\nFor on-premises deployments, you can dedicate a policy to all the agents in the network boundary and configure that policy to include a specific Fleet Server (or a cluster of Fleet Servers).\nMake sure Enroll in Fleet is selected.\nDownload, install, and enroll the Elastic Agent on your host by selecting your host operating system and following the Install Elastic Agent on your host step."
-  },
-  {
-    "input": "How would I run Malware prevention via Elastic Defend on my macOS systems?",
-    "reference": "Elastic Defend malware prevention detects and stops malicious attacks by using a machine learning model that looks for static attributes to determine if a file is malicious or benign. You will need to create a fleet policy with the Elastic Defend integration.\n\nBy default, malware protection is enabled on Windows, macOS, and Linux hosts. To disable malware protection, switch the Malware protections enabled toggle off.\n\nMalware protection levels are:\n\nDetect: Detects malware on the host and generates an alert. The agent will not block malware. You must pay attention to and analyze any malware alerts that are generated.\nPrevent (Default): Detects malware on the host, blocks it from executing, and generates an alert.\nSelect Notify user to send a push notification in the host operating system when activity is detected or prevented. Notifications are enabled by default for the Prevent option.\n\nPlatinum and Enterprise customers can customize these notifications using the Elastic Security {action} {filename} syntax.\nMalware protection also allows you to manage a blocklist to prevent specified applications from running on hosts, extending the list of processes that Elastic Defend considers malicious. Use the Blocklist enabled toggle to enable or disable this feature for all hosts associated with the integration policy. To configure the blocklist, refer to Blocklist.\n\nWhen Prevent is enabled for malware protection, Elastic Defend will quarantine any malicious file it finds. 
Specifically Elastic Defend will remove the file from its current location, encrypt it with the encryption key ELASTIC, move it to a different folder, and rename it as a GUID string, such as 318e70c2-af9b-4c3a-939d-11410b9a112c.\n\nThe quarantine folder location varies by operating system:\n\nmacOS: /System/Volumes/Data/.equarantine\nLinux: .equarantine at the root of the mount point of the file being quarantined\nWindows - Elastic Defend versions 8.5 and later: [DriveLetter:]\\.quarantine, unless the files are from the C: drive. These files are moved to C:\\Program Files\\Elastic\\Endpoint\\state\\.equarantine.\nWindows - Elastic Defend versions 8.4 and earlier: [DriveLetter:]\\.quarantine, for any drive\nTo restore a quarantined file to its original state and location, add an exception to the rule that identified the file as malicious. If the exception would’ve stopped the rule from identifying the file as malicious, Elastic Defend restores the file.\n\nVisit the documentation for further guidance."
-  },
-  {
-    "input": "How do I build a visualization in Elastic Security to display to top 10 processes with outbound traffic over time?",
-    "reference": "Visualization in Elastic Security are powered by Lens. To build a new Lens visualization, go the the Analytics menu in Kibana, select \"visualize library\" and click on the \"create visualization\" button. Select the \"Lens\" tile. Pick your chart type, and in the Lens layer, \n\nSelect the correct data view that contains your network/process events from the top right. The \"Horizontal axis\" field should contain \"@timestamp\". The vertical axis should contain a metric which is the \"sum of destination.bytes\". The breakdown section should be the \"top 10 values of process.name\". Visit the lens documentation for further information."
-  },
-  {
-    "input": "How do I configure a generated alert to send an e-mail notification to my security operations center in Elastic Security?",
-    "reference": "This is acheived by using alert actions on the Elastic Security detection rule in question.\n\nNavigate to the Alerts page in Elastic Security and click on the \"Manage Rules\" button. Select the detection rule(s) you would like to set up th e-mail action for. Select the connector type (in this case, e-mail), set its action frequency and any additional conditions. Add the body content that you would like to include in any e-mails that are sent. When ready, save the changes to the rule. Visit the documentation for more information."
-  },
-  {
-    "input": "What would be the best way to add an exception for my Elastic Security rules not to trigger when the host of any given event is within a specified list?",
-    "reference": "Value lists hold multiple values of the same Elasticsearch data type, such as IP addresses, which are used to determine when an exception prevents an alert from being generated. You can use value lists to define exceptions for detection rules; however, you cannot use value lists to define endpoint rule exceptions.\n\nAfter creating value lists, you can use is in list and is not in list operators to define exceptions.\n\nTo create a value list:\n\nPrepare a txt or csv file with all the values you want to use for determining exceptions from a single list. If you use a txt file, new lines act as delimiters.\nGo to Manage → Rules.\nClick Import value lists. The Import value lists window opens.\nSelect the list type (Keywords, IP addresses, IP ranges, or Text) from the Type of value list drop-down.\nDrag or select the csv or txt file that contains the values.\nClick Import value list.\n\nFor more information on value lists, please reference the documentation"
-  },
-  {
-    "input": "I need to ingest GCP Audit logs for use within Elastic Security. Please provide instructions on how to do this.",
-    "reference": "There is a pre built Elastic Agent that will allow you to ingest GCP Audit logs easily. Once the audit logs have been configured within the Google cloud console accordingly, proceed to the Fleet management menu in kibana. Go to integrations, and add the \"Google Cloud Platform (GCP) Audit logs\" integration to a new or existing policy. You will need to supply the credentials for the specfic Google Cloud Project, the pub-sub topic and subscription name. Once that is done, the changes will be applied to the hosts that are assigned to the policy you just added the integration to. Reference the documentation for further information about this integration."
-  },
-  {
-    "input": "Please provide instructions on how I can add screenshots to a case within Elastic Security",
-    "reference": "Once you have your screenshots saved as files on your local system, you can upload these via the files tab within an Elastic Security case. You can set file types and sizes by configuring your Kibana case settings.\n\nWhen you add a file, a comment is added to the case activity log. To view an image, click its name in the activity or file list.\n\nImages with supported image types can be rendered within the case simply by clicking on the file name. Reference the documentation for further information."
-  },
-  {
-    "input": "How can I terminate/kill a malicious process that I've identified on one of my hosts via Elastic Defend?",
-    "reference": "Elastic Defend allows you to run response actions via the dedicated response console.\n\nLaunch the response console from any of the following places in Elastic Security:\n\nEndpoints page → Actions menu (…​) → Respond\nEndpoint details flyout → Take action → Respond\nAlert details flyout → Take action → Respond\n\nTo perform an action on the endpoint, enter a response action command in the input area at the bottom of the console, then press Return. Output from the action is displayed in the console.\n\nIf a host is unavailable, pending actions will execute once the host comes online. Pending actions expire after two weeks and can be tracked in the response actions history.\n\nActivity in the response console is persistent, so you can navigate away from the page and any pending actions you’ve submitted will continue to run. To confirm that an action completed, return to the response console to view the console output or check the response actions history.\n\nUse the \"kill-process\" command to terminate a process. You must include one of the following parameters to identify the process to terminate:\n\n--pid : A process ID (PID) representing the process to terminate.\n--entityId : An entity ID representing the process to terminate.\nRequired privilege: Process Operations\n\nExample: kill-process --pid 123 --comment \"Terminate suspicious process\"\n\nPlease reference the documentation for more information."
-  },
-  {
-    "input": "I need to adjust the data retention policy for my instance of Elastic Security. Please provide instructions on how to do this.",
-    "reference": "Assuming you are using the default policy for Elastic Agent data streams, you can change the default policy as you require.\n\nTo view the logs policy in Kibana:\n\nOpen the menu and go to Stack Management > Index Lifecycle Policies.\nSelect Include managed system policies.\nSelect the logs policy.\n\nThe default logs policy is designed to prevent the creation of many tiny daily indices. You can modify the policy to meet your performance requirements and manage resource usage.\n\nAs an example, to activate the warm phase, tick the option for the Warm Phase. and click Advanced settings.\n\nSet Move data into phase when to 30 days old. This moves indices to the warm tier 30 days after rollover.\nEnable Set replicas and change Number of replicas to 1.\nEnable Force merge data and set Number of segments to 1.\n\nYou can change each phase of the policy in the same way. For more information, view the documentation."
-  },
-  {
-    "input": "Can you explain what a \"New terms\" rule type is in context of the Elastic Security Detection engine?",
-    "reference": "A new terms rule generates an alert for each new term detected in source documents within a specified time range. You can also detect a combination of up to three new terms (for example, a host.ip and host.id that have never been observed together before). View the documentation for more information about this rule type."
-  },
-  {
-    "input": "What threat intelligence feeds can I use natively within Elastic Security?",
-    "reference": "Elastic Security supports several threat intelligence feeds natively via Elastic Agent integrations. Here is the full list of providers. Each provider can have several different feeds supported by the integration:\n\n- AbuseCH\n- AlienVault OTX\n- Anomali\n- Cyberark\n- Cybersixgill\n- Maltiverse\n- Rapid7 Threat Command\n- Recorded Future\n- Threat Quotient\n\nElastic Agent can also integrate with threat intelligence platforms such as MISP and Collective Intelligence Framework."
-  },
-  {
-    "input": "When a specific alert triggers, I need to collect the security patches that are installed on the system at that point in time. How would I do this within Elastic Security?",
-    "reference": "Elastic Security has native support for OSQuery Management via Elastic Agent. OSQuery can be invoked to run a specified query whenever an alert rule triggers. In this case, OSQuery can be queried for the table within its schema for security patches.\n\nYou can add Osquery Response Actions to new or existing custom query rules. Queries run every time the rule executes.\n\nChoose one of the following:\n\nNew rule: When you are on the last step of custom query rule creation, go to the Response Actions section and click the Osquery icon.\nExisting rule: Edit the rule’s settings, then go to the Actions tab. In the tab, click the Osquery icon under the Response Actions section.\n\n\nSpecify whether you want to set up a single live query or a pack:\n\nQuery: Select a saved query or enter a new one. After you enter the query, you can expand the Advanced section to view or set mapped ECS fields included in the results from the live query. Mapping ECS fields is optional.\n\nYou can use placeholder fields to dynamically add alert data to your query.\nPack: Select from available query packs. After you select a pack, all of the queries in the pack are displayed.\n\nClick the Osquery icon to add more live queries (optional).\nClick Create & enable rule (for a new rule) or Save changes (for existing rules) to finish adding the queries.\n\nIn this case, the query in question will be - \"\"select * from patches where description == \"\"Security Update\"\"\"\"\n\nFor more information about running OSQuery responses, view the documentation."
-  },
-  {
-    "input": "Where would I raise an issue/request for Elastic Security detections and preventions?",
-    "reference": "Issues and requests for Elastic Security detections can be raised in https://github.com/elastic/detection-rules. The repository for protections is https://github.com/elastic/protections-artifacts."
-  }
-]
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/evaluation.ts b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/evaluation.ts
deleted file mode 100644
index 93f164835876a..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/evaluation.ts
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { loadEvaluator } from 'langchain/evaluation';
-import { LLM } from '@langchain/core/language_models/llms';
-import { ChainValues } from '@langchain/core/utils/types';
-import { HumanMessage } from '@langchain/core/messages';
-import { chunk as createChunks } from 'lodash/fp';
-import { Logger } from '@kbn/core/server';
-import { ToolingLog } from '@kbn/tooling-log';
-import { LangChainTracer } from '@langchain/core/tracers/tracer_langchain';
-import { RunCollectorCallbackHandler } from '@langchain/core/tracers/run_collector';
-import { Dataset } from '@kbn/elastic-assistant-common';
-import { isLangSmithEnabled } from '@kbn/langchain/server/tracers/langsmith';
-import { AgentExecutorEvaluatorWithMetadata } from '../langchain/executors/types';
-import { callAgentWithRetry, getMessageFromLangChainResponse } from './utils';
-import { writeLangSmithFeedback } from '../../routes/evaluate/utils';
-import { ResponseBody } from '../langchain/types';
-
-export interface PerformEvaluationParams {
-  agentExecutorEvaluators: AgentExecutorEvaluatorWithMetadata[];
-  dataset: Dataset;
-  evaluationId: string;
-  evaluatorModel?: LLM;
-  evaluationPrompt?: string;
-  evaluationType?: string;
-  logger: Logger | ToolingLog;
-  maxConcurrency?: number;
-  runName?: string;
-}
-
-export interface EvaluationResult {
-  '@timestamp': string;
-  connectorName: string;
-  evaluation: ChainValues;
-  evaluationId: string;
-  input: string;
-  inputExampleId?: string | undefined;
-  langSmithLink?: string | undefined;
-  prediction: string;
-  predictionResponse: PromiseSettledResult<ResponseBody>;
-  reference: string;
-  runName: string;
-}
-
-export interface EvaluationSummary {
-  '@timestamp': string;
-  evaluationStart: number;
-  evaluationEnd: number;
-  evaluationId: string;
-  evaluationDuration: number;
-  langSmithLink?: string | undefined;
-  runName: string;
-  totalAgents: number;
-  totalRequests: number;
-  totalInput: number;
-}
-
-/**
- * Evaluates a dataset based on an evaluation rubric. Takes dataset of input/reference pairs,
- * and fetches the output (prediction) of the input against the provided agent executors.
- * Then evaluates all three using the provided evaluation rubric.
- */
-export const performEvaluation = async ({
-  agentExecutorEvaluators,
-  dataset,
-  evaluationId,
-  evaluatorModel,
-  evaluationPrompt,
-  evaluationType,
-  maxConcurrency = 1,
-  logger,
-  runName = 'default-run-name',
-}: PerformEvaluationParams) => {
-  const startTime = new Date().getTime();
-  const evaluationResults: EvaluationResult[] = [];
-
-  const predictionRequests = dataset.flatMap(({ input, reference, id: exampleId }) =>
-    agentExecutorEvaluators.map(
-      ({ agentEvaluator: agent, metadata: { connectorName, runName: agentRunName } }) => ({
-        connectorName,
-        input,
-        reference,
-        exampleId,
-        request: () =>
-          callAgentWithRetry({ agent, exampleId, messages: [new HumanMessage(input)], logger }),
-        runName: agentRunName,
-      })
-    )
-  );
-
-  const requestChunks = createChunks(maxConcurrency, predictionRequests);
-  const totalChunks = requestChunks.length;
-
-  logger.info(`Total prediction requests: ${predictionRequests.length}`);
-  logger.info(`Chunk size (maxConcurrency): ${maxConcurrency}`);
-  logger.info(`Total chunks: ${totalChunks}`);
-  logger.info('Fetching predictions...');
-
-  while (requestChunks.length) {
-    const chunk = requestChunks.shift() ?? [];
-    const chunkNumber = totalChunks - requestChunks.length;
-    logger.info(`Prediction request chunk: ${chunkNumber} of ${totalChunks}`);
-
-    // Note, order is kept between chunk and dataset, and is preserved w/ Promise.allSettled
-    const chunkResults = await Promise.allSettled(chunk.map((r) => r.request()));
-    logger.info(
-      `Prediction request chunk ${chunkNumber} response:\n${JSON.stringify(chunkResults)}`
-    );
-    chunkResults.forEach((response, chunkResultIndex) =>
-      evaluationResults.push({
-        '@timestamp': new Date().toISOString(),
-        connectorName: chunk[chunkResultIndex].connectorName,
-        input: chunk[chunkResultIndex].input,
-        inputExampleId: chunk[chunkResultIndex].exampleId,
-        reference: chunk[chunkResultIndex].reference,
-        evaluationId,
-        evaluation: {},
-        prediction: getMessageFromLangChainResponse(response),
-        predictionResponse: response,
-        runName: chunk[chunkResultIndex].runName,
-      })
-    );
-  }
-
-  logger.info(`Prediction results:\n${JSON.stringify(evaluationResults)}`);
-
-  if (evaluatorModel == null) {
-    const endTime = new Date().getTime();
-
-    const evaluationSummary: EvaluationSummary = {
-      evaluationId,
-      '@timestamp': new Date().toISOString(),
-      evaluationStart: startTime,
-      evaluationEnd: endTime,
-      evaluationDuration: endTime - startTime,
-      runName,
-      totalAgents: agentExecutorEvaluators.length,
-      totalInput: dataset.length,
-      totalRequests: predictionRequests.length,
-    };
-
-    logger.info(`Final results:\n${JSON.stringify(evaluationResults)}`);
-
-    return { evaluationResults, evaluationSummary };
-  }
-
-  // Continue with actual evaluation if expected
-  logger.info('Performing evaluation....');
-  logger.info(`Evaluation model: ${evaluatorModel._llmType()}`);
-
-  if (evaluationType === 'correctness') {
-    logger.info('Evaluation type: correctness');
-    const evaluator = await loadEvaluator('labeled_criteria', {
-      criteria: 'correctness',
-      llm: evaluatorModel,
-    });
-
-    for (const result of evaluationResults) {
-      const { input, inputExampleId: exampleId, prediction, reference } = result;
-      // Create an eval tracer so eval traces end up in the right project (runName in this instance as to correlate
-      // with the test run), don't supply `exampleID` as that results in a new Dataset `Test` run being created and
-      // polluting the `predictions` that ran above
-      const evalTracer = new LangChainTracer({
-        projectName: runName,
-      });
-      // Create RunCollector for uploading evals to LangSmith, no TS variant for `EvaluatorCallbackHandler` or
-      // `run_on_dataset` w/ eval config, so using `RunCollectorCallbackHandler` and then uploading manually via
-      // client.createFeedback()
-      // See: https://github.com/langchain-ai/langsmith-sdk/blob/18449e5848d85ac0a320f320c37f454f949de1e1/js/src/client.ts#L1249-L1256
-      const runCollector = new RunCollectorCallbackHandler({ exampleId });
-      const evaluation = await evaluator.evaluateStrings(
-        {
-          input,
-          prediction,
-          reference,
-        },
-        {
-          callbacks: [...(isLangSmithEnabled() ? [evalTracer, runCollector] : [])],
-          tags: ['security-assistant-evaluation'],
-        }
-      );
-      result.evaluation = evaluation;
-
-      // Write to LangSmith
-      if (isLangSmithEnabled()) {
-        const langSmithLink = await writeLangSmithFeedback(
-          runCollector.tracedRuns[0],
-          evaluationId,
-          logger
-        );
-        result.langSmithLink = langSmithLink;
-      }
-    }
-  } else if (evaluationType === 'esql-validator') {
-    logger.info('Evaluation type: esql-validator');
-    // TODO: Implement esql-validator here
-  } else if (evaluationType === 'custom') {
-    logger.info('Evaluation type: custom');
-    // TODO: Implement custom evaluation here
-    // const llm = new ChatOpenAI({ temperature: 0, tags: ["my-llm-tag"] });
-    // const prompt = PromptTemplate.fromTemplate("Say {input}");
-    // const chain = prompt.pipe(llm).withConfig( { tags: ["my-bash-tag", "another-tag"] });
-    // await chain.invoke({ input: "Hello, World!"}, { tags: ["shared-tags"] });
-  }
-
-  const endTime = new Date().getTime();
-
-  const evaluationSummary: EvaluationSummary = {
-    evaluationId,
-    '@timestamp': new Date().toISOString(),
-    evaluationStart: startTime,
-    evaluationEnd: endTime,
-    evaluationDuration: endTime - startTime,
-    runName,
-    totalAgents: agentExecutorEvaluators.length,
-    totalInput: dataset.length,
-    totalRequests: predictionRequests.length,
-  };
-
-  logger.info(`Final results:\n${JSON.stringify(evaluationResults)}`);
-
-  return { evaluationResults, evaluationSummary };
-};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/mappings.ts b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/mappings.ts
deleted file mode 100644
index 6040404649ba8..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/mappings.ts
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
-
-export const evaluationIndexMappings: MappingTypeMapping = {
-  properties: {
-    '@timestamp': {
-      type: 'date',
-    },
-    evaluation: {
-      properties: {
-        reasoning: {
-          type: 'text',
-          fields: {
-            keyword: {
-              type: 'keyword',
-              ignore_above: 1024,
-            },
-          },
-        },
-        score: {
-          type: 'long',
-        },
-        value: {
-          type: 'text',
-        },
-      },
-    },
-    evaluationId: {
-      type: 'text',
-      fields: {
-        keyword: {
-          type: 'keyword',
-          ignore_above: 1024,
-        },
-      },
-    },
-    evaluationStart: {
-      type: 'long',
-    },
-    evaluationEnd: {
-      type: 'long',
-    },
-    evaluationDuration: {
-      type: 'long',
-    },
-    input: {
-      type: 'text',
-      fields: {
-        keyword: {
-          type: 'keyword',
-          ignore_above: 1024,
-        },
-      },
-    },
-    prediction: {
-      type: 'text',
-      fields: {
-        keyword: {
-          type: 'keyword',
-          ignore_above: 1024,
-        },
-      },
-    },
-    predictionResponse: {
-      properties: {
-        status: {
-          type: 'text',
-        },
-        value: {
-          properties: {
-            connector_id: {
-              type: 'text',
-            },
-            data: {
-              type: 'text',
-              fields: {
-                keyword: {
-                  type: 'keyword',
-                  ignore_above: 1024,
-                },
-              },
-            },
-            status: {
-              type: 'text',
-            },
-          },
-        },
-      },
-    },
-    reference: {
-      type: 'text',
-      fields: {
-        keyword: {
-          type: 'keyword',
-          ignore_above: 1024,
-        },
-      },
-    },
-    totalAgents: {
-      type: 'long',
-    },
-    totalInput: {
-      type: 'long',
-    },
-    totalRequests: {
-      type: 'long',
-    },
-  },
-};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/utils.ts b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/utils.ts
deleted file mode 100644
index 61b410df49aa0..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/output_index/utils.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
-import { Logger } from '@kbn/logging';
-import { ToolingLog } from '@kbn/tooling-log';
-import { evaluationIndexMappings as mappings } from './mappings';
-import { EvaluationResult, EvaluationSummary } from '../evaluation';
-
-interface SetupIndexParams {
-  esClient: ElasticsearchClient;
-  index: string;
-  logger: Logger | ToolingLog;
-}
-
-/**
- * Sets up the output index for the model evaluator. Creates index with mappings
- * if not already exists
- *
- * @param {Object} options - The options object.
- * @param {ElasticsearchClient} options.esClient Elasticsearch client
- * @param {string} options.index Name of the output index
- *
- * @returns {Promise<boolean>} True if index exists or created successfully
- */
-export const setupEvaluationIndex = async ({
-  esClient,
-  index,
-  logger,
-}: SetupIndexParams): Promise<boolean> => {
-  // Check if index exists
-  const indexExists = await esClient.indices.exists({ index });
-  if (indexExists) {
-    logger.info(`Index "${index}" already exists`);
-    return true;
-  }
-
-  // Create index with default eval mappings if not exists
-  const settings = {};
-  const response = await esClient.indices.create({
-    index,
-    mappings,
-    settings,
-  });
-
-  if (response.acknowledged) {
-    logger.info(`Created index "${index}"`);
-  } else {
-    logger.error(`Error creating index "${index}"`);
-  }
-
-  return response.acknowledged;
-};
-
-interface IndexEvaluationsParams {
-  esClient: ElasticsearchClient;
-  evaluationResults: EvaluationResult[];
-  evaluationSummary: EvaluationSummary;
-  index: string;
-  logger: Logger | ToolingLog;
-}
-
-/**
- * Indexes evaluation results into the output index
- * @param {Object} options - The options object.
- * @param {ElasticsearchClient} options.esClient Elasticsearch client
- * @param {EvaluationResult[]} options.evaluationResults Individual eval results
- * @param {EvaluationResult[]} options.evaluationSummary Summary of eval
- * @param {string} options.index Name of the output index
- *
- * @returns {Promise<boolean>} True if documents created successfully
- */
-export const indexEvaluations = async ({
-  esClient,
-  evaluationResults,
-  evaluationSummary,
-  index,
-  logger,
-}: IndexEvaluationsParams): Promise<boolean> => {
-  try {
-    const response = await esClient.helpers.bulk({
-      datasource: evaluationResults,
-      onDocument(doc) {
-        return { index: { _index: index } };
-      },
-    });
-
-    logger.info(`Writing evaluations...`);
-    logger.info(`Evaluations bulk index response:\n${JSON.stringify(response)}`);
-
-    logger.info(`Writing summary...`);
-    const summaryResponse = await esClient.index({ index, document: evaluationSummary });
-    logger.info(`Summary index response:\n${JSON.stringify(summaryResponse)}`);
-
-    return true;
-  } catch (e) {
-    logger.error(`Error indexing data into the evaluation index\n${e}`);
-    return false;
-  }
-};
diff --git a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/utils.ts b/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/utils.ts
deleted file mode 100644
index 5e39b7e0bf90e..0000000000000
--- a/x-pack/plugins/elastic_assistant/server/lib/model_evaluator/utils.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { Logger } from '@kbn/logging';
-import { ToolingLog } from '@kbn/tooling-log';
-import { BaseMessage } from '@langchain/core/messages';
-import { ResponseBody } from '../langchain/types';
-import { AgentExecutorEvaluator } from '../langchain/executors/types';
-
-export const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
-
-export interface CallAgentWithRetryParams {
-  agent: AgentExecutorEvaluator;
-  exampleId?: string;
-  messages: BaseMessage[];
-  logger: Logger | ToolingLog;
-  maxRetries?: number;
-}
-export const callAgentWithRetry = async ({
-  agent,
-  exampleId,
-  messages,
-  logger,
-  maxRetries = 3,
-}: CallAgentWithRetryParams) => {
-  for (let attempt = 0; attempt < maxRetries; attempt++) {
-    try {
-      return await agent(messages, exampleId);
-    } catch (error) {
-      // Check for 429, and then if there is a retry-after header
-      const { isRateLimitError, retryAfter } = parseErrorMessage(error);
-      if (isRateLimitError) {
-        logger.error(
-          "callAgentWithRetry: Slow down! You're going too fast! 429 detected! Retrying after..."
-        );
-        if (retryAfter != null) {
-          logger.error(`${retryAfter} seconds`);
-          await wait(retryAfter * 1000);
-          // eslint-disable-next-line no-continue
-          continue;
-        }
-      }
-      // If not 429 or there is no retry-after header, reject the promise
-      logger.error(`Error calling agent:\n${error}`);
-      return Promise.reject(error);
-    }
-  }
-  logger.error(`callAgentWithRetry: Max retries reached: ${maxRetries}`);
-  // Reject and keep going!
-  // eslint-disable-next-line prefer-promise-reject-errors
-  return Promise.reject(`callAgentWithRetry: Max retries reached: ${maxRetries}`);
-};
-
-export const getMessageFromLangChainResponse = (
-  response: PromiseSettledResult<ResponseBody>
-): string => {
-  if (response.status === 'fulfilled' && response.value.data != null) {
-    return response.value.data;
-  }
-  return 'error';
-};
-
-/**
- * Parse an error message coming back from the agent via the actions frameworks to determine if it is
- * a rate limit error and extract the retry after delay.
- *
- * Note: Could be simplified by instrumenting agents w/ callback where there's access to the actual response
- * @param error
- */
-export const parseErrorMessage = (
-  error: Error
-): { isRateLimitError: boolean; retryAfter: number | null } => {
-  const errorMessage: string = error.message;
-
-  const rateLimitRegex = /Status code: 429.*?Please retry after (\d+) seconds/;
-  const match = errorMessage.match(rateLimitRegex);
-
-  // If there is a match, return the parsed delay; otherwise, return an indication that it is not a 429 error.
-  if (match && match[1]) {
-    return { isRateLimitError: true, retryAfter: parseInt(match[1], 10) };
-  } else {
-    return { isRateLimitError: false, retryAfter: null };
-  }
-};
diff --git a/x-pack/plugins/elastic_assistant/server/routes/evaluate/get_evaluate.ts b/x-pack/plugins/elastic_assistant/server/routes/evaluate/get_evaluate.ts
index b62cfd7640cb8..41b455a73598b 100644
--- a/x-pack/plugins/elastic_assistant/server/routes/evaluate/get_evaluate.ts
+++ b/x-pack/plugins/elastic_assistant/server/routes/evaluate/get_evaluate.ts
@@ -10,21 +10,22 @@ import { transformError } from '@kbn/securitysolution-es-utils';
 
 import {
   API_VERSIONS,
+  ELASTIC_AI_ASSISTANT_EVALUATE_URL,
   INTERNAL_API_ACCESS,
   GetEvaluateResponse,
 } from '@kbn/elastic-assistant-common';
 import { buildRouteValidationWithZod } from '@kbn/elastic-assistant-common/impl/schemas/common';
 import { buildResponse } from '../../lib/build_response';
 import { ElasticAssistantRequestHandlerContext } from '../../types';
-import { EVALUATE } from '../../../common/constants';
-import { DEFAULT_PLUGIN_NAME, getPluginNameFromRequest } from '../helpers';
-import { AGENT_EXECUTOR_MAP } from '../../lib/langchain/executors';
+import { performChecks } from '../helpers';
+import { ASSISTANT_GRAPH_MAP } from '../../lib/langchain/graphs';
+import { fetchLangSmithDatasets } from './utils';
 
 export const getEvaluateRoute = (router: IRouter<ElasticAssistantRequestHandlerContext>) => {
   router.versioned
     .get({
       access: INTERNAL_API_ACCESS,
-      path: EVALUATE,
+      path: ELASTIC_AI_ASSISTANT_EVALUATE_URL,
       options: {
         tags: ['access:elasticAssistant'],
       },
@@ -41,22 +42,28 @@ export const getEvaluateRoute = (router: IRouter<ElasticAssistantRequestHandlerC
         },
       },
       async (context, request, response): Promise<IKibanaResponse<GetEvaluateResponse>> => {
-        const assistantContext = await context.elasticAssistant;
-        const logger = assistantContext.logger;
+        const ctx = await context.resolve(['core', 'elasticAssistant', 'licensing']);
+        const assistantContext = ctx.elasticAssistant;
+        const logger = assistantContext.logger.get('evaluate');
 
-        // Validate evaluation feature is enabled
-        const pluginName = getPluginNameFromRequest({
+        // Perform license, authenticated user and evaluation FF checks
+        const checkResponse = performChecks({
+          authenticatedUser: true,
+          capability: 'assistantModelEvaluation',
+          context: ctx,
+          license: true,
           request,
-          defaultPluginName: DEFAULT_PLUGIN_NAME,
-          logger,
+          response,
         });
-        const registeredFeatures = assistantContext.getRegisteredFeatures(pluginName);
-        if (!registeredFeatures.assistantModelEvaluation) {
-          return response.notFound();
+        if (checkResponse) {
+          return checkResponse;
         }
 
+        // Fetch datasets from LangSmith // TODO: plumb apiKey so this will work in cloud w/o env vars
+        const datasets = await fetchLangSmithDatasets({ logger });
+
         try {
-          return response.ok({ body: { agentExecutors: Object.keys(AGENT_EXECUTOR_MAP) } });
+          return response.ok({ body: { graphs: Object.keys(ASSISTANT_GRAPH_MAP), datasets } });
         } catch (err) {
           logger.error(err);
           const error = transformError(err);
diff --git a/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.test.ts b/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.test.ts
index b3fb27fa835c1..f5a9618da19a1 100644
--- a/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.test.ts
+++ b/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.test.ts
@@ -9,36 +9,44 @@ import { postEvaluateRoute } from './post_evaluate';
 import { serverMock } from '../../__mocks__/server';
 import { requestContextMock } from '../../__mocks__/request_context';
 import { getPostEvaluateRequest } from '../../__mocks__/request';
+import { elasticsearchServiceMock } from '@kbn/core-elasticsearch-server-mocks';
 import {
   defaultAssistantFeatures,
   PostEvaluateRequestBodyInput,
-  PostEvaluateRequestQueryInput,
 } from '@kbn/elastic-assistant-common';
+import type { AuthenticatedUser } from '@kbn/core-security-common';
 
 const defaultBody: PostEvaluateRequestBodyInput = {
-  dataset: undefined,
-  evalPrompt: undefined,
-};
-
-const defaultQueryParams: PostEvaluateRequestQueryInput = {
-  agents: 'agents',
-  datasetName: undefined,
-  evaluationType: undefined,
-  evalModel: undefined,
-  models: 'models',
-  outputIndex: '.kibana-elastic-ai-assistant-',
-  projectName: undefined,
+  datasetName: 'datasetName',
+  graphs: ['graphs'],
+  connectorIds: ['id1', 'id2'],
   runName: undefined,
+  langSmithApiKey: undefined,
 };
 
 describe('Post Evaluate Route', () => {
-  let server: ReturnType<typeof serverMock.create>;
-  let { context } = requestContextMock.createTools();
+  const { clients, context } = requestContextMock.createTools();
+  const server: ReturnType<typeof serverMock.create> = serverMock.create();
+  clients.core.elasticsearch.client = elasticsearchServiceMock.createScopedClusterClient();
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  (server as any).responseMock.notFound = jest.fn().mockReturnValue({
+    status: 404,
+    payload: 'Not Found',
+  });
+
+  const mockUser = {
+    username: 'my_username',
+    authentication_realm: {
+      type: 'my_realm_type',
+      name: 'my_realm_name',
+    },
+  } as AuthenticatedUser;
+
   const mockGetElser = jest.fn().mockResolvedValue('.elser_model_2');
 
   beforeEach(() => {
-    server = serverMock.create();
-    ({ context } = requestContextMock.createTools());
+    jest.clearAllMocks();
+    context.elasticAssistant.getCurrentUser.mockReturnValue(mockUser);
 
     postEvaluateRoute(server.router, mockGetElser);
   });
@@ -51,7 +59,7 @@ describe('Post Evaluate Route', () => {
       });
 
       const response = await server.inject(
-        getPostEvaluateRequest({ body: defaultBody, query: defaultQueryParams }),
+        getPostEvaluateRequest({ body: defaultBody }),
         requestContextMock.convertContext(context)
       );
       expect(response.status).toEqual(404);
diff --git a/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts b/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts
index 56e99491a6f9d..80130e1750c42 100644
--- a/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts
+++ b/x-pack/plugins/elastic_assistant/server/routes/evaluate/post_evaluate.ts
@@ -5,42 +5,54 @@
  * 2.0.
  */
 
-import { type IKibanaResponse, IRouter, KibanaRequest } from '@kbn/core/server';
+import { IRouter, KibanaRequest, type IKibanaResponse } from '@kbn/core/server';
 import { transformError } from '@kbn/securitysolution-es-utils';
+import { asyncForEach } from '@kbn/std';
+import { Client } from 'langsmith';
+import { evaluate } from 'langsmith/evaluation';
 import { v4 as uuidv4 } from 'uuid';
 
+import { getRequestAbortedSignal } from '@kbn/data-plugin/server';
 import {
   API_VERSIONS,
+  ELASTIC_AI_ASSISTANT_EVALUATE_URL,
+  ExecuteConnectorRequestBody,
   INTERNAL_API_ACCESS,
   PostEvaluateBody,
-  PostEvaluateRequestQuery,
   PostEvaluateResponse,
-  ExecuteConnectorRequestBody,
 } from '@kbn/elastic-assistant-common';
-import { ActionsClientLlm } from '@kbn/langchain/server';
-import { getLangSmithTracer } from '@kbn/langchain/server/tracers/langsmith';
 import { buildRouteValidationWithZod } from '@kbn/elastic-assistant-common/impl/schemas/common';
-import { ESQL_RESOURCE, KNOWLEDGE_BASE_INDEX_PATTERN } from '../knowledge_base/constants';
-import { buildResponse } from '../../lib/build_response';
-import { ElasticAssistantRequestHandlerContext, GetElser } from '../../types';
-import { EVALUATE } from '../../../common/constants';
-import { performEvaluation } from '../../lib/model_evaluator/evaluation';
-import { AgentExecutorEvaluatorWithMetadata } from '../../lib/langchain/executors/types';
+import { getDefaultArguments } from '@kbn/langchain/server';
+import { StructuredTool } from '@langchain/core/tools';
 import {
-  indexEvaluations,
-  setupEvaluationIndex,
-} from '../../lib/model_evaluator/output_index/utils';
-import { fetchLangSmithDataset, getConnectorName } from './utils';
-import { DEFAULT_PLUGIN_NAME, getPluginNameFromRequest } from '../helpers';
-
-/**
- * To support additional Agent Executors from the UI, add them to this map
- * and reference your specific AgentExecutor function
- */
-import { AGENT_EXECUTOR_MAP } from '../../lib/langchain/executors';
+  createOpenAIFunctionsAgent,
+  createStructuredChatAgent,
+  createToolCallingAgent,
+} from 'langchain/agents';
+import { RetrievalQAChain } from 'langchain/chains';
+import { buildResponse } from '../../lib/build_response';
+import { AssistantDataClients } from '../../lib/langchain/executors/types';
+import { AssistantToolParams, ElasticAssistantRequestHandlerContext, GetElser } from '../../types';
+import { DEFAULT_PLUGIN_NAME, performChecks } from '../helpers';
+import { ESQL_RESOURCE } from '../knowledge_base/constants';
+import { fetchLangSmithDataset } from './utils';
+import { transformESSearchToAnonymizationFields } from '../../ai_assistant_data_clients/anonymization_fields/helpers';
+import { EsAnonymizationFieldsSchema } from '../../ai_assistant_data_clients/anonymization_fields/types';
 import { ElasticsearchStore } from '../../lib/langchain/elasticsearch_store/elasticsearch_store';
+import {
+  DefaultAssistantGraph,
+  getDefaultAssistantGraph,
+} from '../../lib/langchain/graphs/default_assistant_graph/graph';
+import {
+  bedrockToolCallingAgentPrompt,
+  geminiToolCallingAgentPrompt,
+  openAIFunctionAgentPrompt,
+  structuredChatAgentPrompt,
+} from '../../lib/langchain/graphs/default_assistant_graph/prompts';
+import { getLlmClass, getLlmType } from '../utils';
 
 const DEFAULT_SIZE = 20;
+const ROUTE_HANDLER_TIMEOUT = 10 * 60 * 1000; // 10 * 60 seconds = 10 minutes
 
 export const postEvaluateRoute = (
   router: IRouter<ElasticAssistantRequestHandlerContext>,
@@ -49,10 +61,12 @@ export const postEvaluateRoute = (
   router.versioned
     .post({
       access: INTERNAL_API_ACCESS,
-      path: EVALUATE,
-
+      path: ELASTIC_AI_ASSISTANT_EVALUATE_URL,
       options: {
         tags: ['access:elasticAssistant'],
+        timeout: {
+          idleSocket: ROUTE_HANDLER_TIMEOUT,
+        },
       },
     })
     .addVersion(
@@ -61,7 +75,6 @@ export const postEvaluateRoute = (
         validate: {
           request: {
             body: buildRouteValidationWithZod(PostEvaluateBody),
-            query: buildRouteValidationWithZod(PostEvaluateRequestQuery),
           },
           response: {
             200: {
@@ -71,108 +84,88 @@ export const postEvaluateRoute = (
         },
       },
       async (context, request, response): Promise<IKibanaResponse<PostEvaluateResponse>> => {
-        const assistantContext = await context.elasticAssistant;
-        const logger = assistantContext.logger;
+        const ctx = await context.resolve(['core', 'elasticAssistant', 'licensing']);
+        const assistantContext = ctx.elasticAssistant;
+        const actions = ctx.elasticAssistant.actions;
+        const logger = assistantContext.logger.get('evaluate');
         const telemetry = assistantContext.telemetry;
+        const abortSignal = getRequestAbortedSignal(request.events.aborted$);
 
-        // Validate evaluation feature is enabled
-        const pluginName = getPluginNameFromRequest({
+        // Perform license, authenticated user and evaluation FF checks
+        const checkResponse = performChecks({
+          authenticatedUser: true,
+          capability: 'assistantModelEvaluation',
+          context: ctx,
+          license: true,
           request,
-          defaultPluginName: DEFAULT_PLUGIN_NAME,
-          logger,
+          response,
         });
-        const registeredFeatures = assistantContext.getRegisteredFeatures(pluginName);
-        if (!registeredFeatures.assistantModelEvaluation) {
-          return response.notFound();
+        if (checkResponse) {
+          return checkResponse;
         }
 
         try {
           const evaluationId = uuidv4();
           const {
-            evalModel,
-            evaluationType,
-            outputIndex,
+            alertsIndexPattern,
             datasetName,
-            projectName = 'default',
+            graphs: graphNames,
+            langSmithApiKey,
+            connectorIds,
+            size,
+            replacements,
             runName = evaluationId,
-          } = request.query;
-          const { dataset: customDataset = [], evalPrompt } = request.body;
-          const connectorIds = request.query.models?.split(',') || [];
-          const agentNames = request.query.agents?.split(',') || [];
+          } = request.body;
 
-          const dataset =
-            datasetName != null ? await fetchLangSmithDataset(datasetName, logger) : customDataset;
+          const dataset = await fetchLangSmithDataset(datasetName, logger, langSmithApiKey);
+
+          if (dataset.length === 0) {
+            return response.badRequest({
+              body: { message: `No LangSmith dataset found for name: ${datasetName}` },
+            });
+          }
 
           logger.info('postEvaluateRoute:');
           logger.info(`request.query:\n${JSON.stringify(request.query, null, 2)}`);
           logger.info(`request.body:\n${JSON.stringify(request.body, null, 2)}`);
           logger.info(`Evaluation ID: ${evaluationId}`);
 
-          const totalExecutions = connectorIds.length * agentNames.length * dataset.length;
-          logger.info('Creating agents:');
+          const totalExecutions = connectorIds.length * graphNames.length * dataset.length;
+          logger.info('Creating graphs:');
           logger.info(`\tconnectors/models: ${connectorIds.length}`);
-          logger.info(`\tagents: ${agentNames.length}`);
+          logger.info(`\tgraphs: ${graphNames.length}`);
           logger.info(`\tdataset: ${dataset.length}`);
-          logger.warn(`\ttotal baseline agent executions: ${totalExecutions} `);
+          logger.warn(`\ttotal graph executions: ${totalExecutions} `);
           if (totalExecutions > 50) {
             logger.warn(
-              `Total baseline agent executions >= 50! This may take a while, and cost some money...`
+              `Total baseline graph executions >= 50! This may take a while, and cost some money...`
             );
           }
 
-          // Get the actions plugin start contract from the request context for the agents
-          const actions = (await context.elasticAssistant).actions;
-
-          // Fetch all connectors from the actions plugin, so we can set the appropriate `llmType` on ActionsClientLlm
-          const actionsClient = await actions.getActionsClientWithRequest(request);
-          const connectors = await actionsClient.getBulk({
-            ids: connectorIds,
-            throwIfSystemAction: false,
-          });
-
-          // Fetch any tools registered by the request's originating plugin
-          const assistantTools = (await context.elasticAssistant).getRegisteredTools(pluginName);
-
-          // Get a scoped esClient for passing to the agents for retrieval, and
-          // writing results to the output index
-          const esClient = (await context.core).elasticsearch.client.asCurrentUser;
+          // Setup graph params
+          // Get a scoped esClient for the esStore and the assistant tool params
+          const esClient = ctx.core.elasticsearch.client.asCurrentUser;
 
           // Default ELSER model
           const elserId = await getElser();
 
-          // Skeleton request from route to pass to the agents
-          // params will be passed to the actions executor
-          const skeletonRequest: KibanaRequest<unknown, unknown, ExecuteConnectorRequestBody> = {
-            ...request,
-            body: {
-              alertsIndexPattern: '',
-              allow: [],
-              allowReplacement: [],
-              subAction: 'invokeAI',
-              // The actionTypeId is irrelevant when used with the invokeAI subaction
-              actionTypeId: '.gen-ai',
-              replacements: {},
-              size: DEFAULT_SIZE,
-              conversationId: '',
-            },
+          // Data clients
+          const anonymizationFieldsDataClient =
+            (await assistantContext.getAIAssistantAnonymizationFieldsDataClient()) ?? undefined;
+          const conversationsDataClient =
+            (await assistantContext.getAIAssistantConversationsDataClient()) ?? undefined;
+          const kbDataClient =
+            (await assistantContext.getAIAssistantKnowledgeBaseDataClient()) ?? undefined;
+          const dataClients: AssistantDataClients = {
+            anonymizationFieldsDataClient,
+            conversationsDataClient,
+            kbDataClient,
           };
 
-          // Create an ElasticsearchStore for KB interactions
-          // Setup with kbDataClient if `enableKnowledgeBaseByDefault` FF is enabled
-          const enableKnowledgeBaseByDefault =
-            assistantContext.getRegisteredFeatures(pluginName).assistantKnowledgeBaseByDefault;
-          const bedrockChatEnabled =
-            assistantContext.getRegisteredFeatures(pluginName).assistantBedrockChat;
-          const kbDataClient = enableKnowledgeBaseByDefault
-            ? (await assistantContext.getAIAssistantKnowledgeBaseDataClient()) ?? undefined
-            : undefined;
-          const kbIndex =
-            enableKnowledgeBaseByDefault && kbDataClient != null
-              ? kbDataClient.indexTemplateAndPattern.alias
-              : KNOWLEDGE_BASE_INDEX_PATTERN;
+          // esStore
           const esStore = new ElasticsearchStore(
             esClient,
-            kbIndex,
+            kbDataClient?.indexTemplateAndPattern?.alias ?? '',
             logger,
             telemetry,
             elserId,
@@ -180,89 +173,160 @@ export const postEvaluateRoute = (
             kbDataClient
           );
 
-          // Create an array of executor functions to call in batches
-          // One for each connector/model + agent combination
-          // Hoist `langChainMessages` so they can be batched by dataset.input in the evaluator
-          const agents: AgentExecutorEvaluatorWithMetadata[] = [];
-          connectorIds.forEach((connectorId) => {
-            agentNames.forEach((agentName) => {
-              logger.info(`Creating agent: ${connectorId} + ${agentName}`);
-              const connectorName =
-                getConnectorName(connectorId, connectors) ?? '[unknown connector]';
-              const detailedRunName = `${runName} - ${connectorName} + ${agentName}`;
-              agents.push({
-                agentEvaluator: async (langChainMessages, exampleId) => {
-                  const evalResult = await AGENT_EXECUTOR_MAP[agentName]({
-                    actionsClient,
-                    assistantTools,
-                    bedrockChatEnabled,
-                    connectorId,
-                    esClient,
-                    esStore,
-                    isStream: false,
-                    langChainMessages,
-                    llmType: 'openai',
-                    logger,
-                    request: skeletonRequest,
-                    traceOptions: {
-                      exampleId,
-                      projectName,
-                      runName: detailedRunName,
-                      evaluationId,
-                      tags: [
-                        'security-assistant-prediction',
-                        ...(connectorName != null ? [connectorName] : []),
-                        runName,
-                      ],
-                      tracers: getLangSmithTracer({
-                        projectName: detailedRunName,
-                        exampleId,
-                        logger,
-                      }),
-                    },
-                    replacements: {},
-                  });
-                  return evalResult.body;
-                },
-                metadata: {
-                  connectorName,
-                  runName: detailedRunName,
-                },
-              });
-            });
+          // Actions
+          const actionsClient = await actions.getActionsClientWithRequest(request);
+          const connectors = await actionsClient.getBulk({
+            ids: connectorIds,
+            throwIfSystemAction: false,
           });
-          logger.info(`Agents created: ${agents.length}`);
 
-          // Evaluator Model is optional to support just running predictions
-          const evaluatorModel =
-            evalModel == null || evalModel === ''
-              ? undefined
-              : new ActionsClientLlm({
+          // Fetch any tools registered to the security assistant
+          const assistantTools = assistantContext.getRegisteredTools(DEFAULT_PLUGIN_NAME);
+
+          const graphs: Array<{ name: string; graph: DefaultAssistantGraph }> = await Promise.all(
+            connectors.map(async (connector) => {
+              const llmType = getLlmType(connector.actionTypeId);
+              const isOpenAI = llmType === 'openai';
+              const llmClass = getLlmClass(llmType, true);
+              const createLlmInstance = () =>
+                new llmClass({
                   actionsClient,
-                  connectorId: evalModel,
+                  connectorId: connector.id,
+                  llmType,
                   logger,
-                  model: skeletonRequest.body.model,
+                  temperature: getDefaultArguments(llmType).temperature,
+                  signal: abortSignal,
+                  streaming: false,
+                  maxRetries: 0,
                 });
+              const llm = createLlmInstance();
+              const anonymizationFieldsRes =
+                await dataClients?.anonymizationFieldsDataClient?.findDocuments<EsAnonymizationFieldsSchema>(
+                  {
+                    perPage: 1000,
+                    page: 1,
+                  }
+                );
 
-          const { evaluationResults, evaluationSummary } = await performEvaluation({
-            agentExecutorEvaluators: agents,
-            dataset,
-            evaluationId,
-            evaluatorModel,
-            evaluationPrompt: evalPrompt,
-            evaluationType,
-            logger,
-            runName,
-          });
+              const anonymizationFields = anonymizationFieldsRes
+                ? transformESSearchToAnonymizationFields(anonymizationFieldsRes.data)
+                : undefined;
 
-          logger.info(`Writing evaluation results to index: ${outputIndex}`);
-          await setupEvaluationIndex({ esClient, index: outputIndex, logger });
-          await indexEvaluations({
-            esClient,
-            evaluationResults,
-            evaluationSummary,
-            index: outputIndex,
-            logger,
+              const modelExists = await esStore.isModelInstalled();
+
+              // Create a chain that uses the ELSER backed ElasticsearchStore, override k=10 for esql query generation for now
+              const chain = RetrievalQAChain.fromLLM(llm, esStore.asRetriever(10));
+
+              // Check if KB is available
+              const isEnabledKnowledgeBase =
+                (await dataClients.kbDataClient?.isModelDeployed()) ?? false;
+
+              // Skeleton request from route to pass to the assistant tools
+              // params will be passed to the actions executor
+              const skeletonRequest: KibanaRequest<unknown, unknown, ExecuteConnectorRequestBody> =
+                {
+                  ...request,
+                  body: {
+                    alertsIndexPattern: '',
+                    allow: [],
+                    allowReplacement: [],
+                    subAction: 'invokeAI',
+                    // The actionTypeId is irrelevant when used with the invokeAI subaction
+                    actionTypeId: '.gen-ai',
+                    replacements: {},
+                    size: DEFAULT_SIZE,
+                    conversationId: '',
+                  },
+                };
+
+              // Fetch any applicable tools that the source plugin may have registered
+              const assistantToolParams: AssistantToolParams = {
+                anonymizationFields,
+                chain,
+                esClient,
+                isEnabledKnowledgeBase,
+                kbDataClient: dataClients?.kbDataClient,
+                llm,
+                logger,
+                modelExists,
+                request: skeletonRequest,
+                alertsIndexPattern,
+                // onNewReplacements,
+                replacements,
+                size,
+              };
+
+              const tools: StructuredTool[] = assistantTools.flatMap(
+                (tool) => tool.getTool(assistantToolParams) ?? []
+              );
+
+              const agentRunnable = isOpenAI
+                ? await createOpenAIFunctionsAgent({
+                    llm,
+                    tools,
+                    prompt: openAIFunctionAgentPrompt,
+                    streamRunnable: false,
+                  })
+                : llmType && ['bedrock', 'gemini'].includes(llmType)
+                ? createToolCallingAgent({
+                    llm,
+                    tools,
+                    prompt:
+                      llmType === 'bedrock'
+                        ? bedrockToolCallingAgentPrompt
+                        : geminiToolCallingAgentPrompt,
+                    streamRunnable: false,
+                  })
+                : await createStructuredChatAgent({
+                    llm,
+                    tools,
+                    prompt: structuredChatAgentPrompt,
+                    streamRunnable: false,
+                  });
+
+              return {
+                name: `${runName} - ${connector.name}`,
+                graph: getDefaultAssistantGraph({
+                  agentRunnable,
+                  conversationId: undefined,
+                  dataClients,
+                  createLlmInstance,
+                  logger,
+                  tools,
+                  responseLanguage: 'English',
+                  replacements: {},
+                  llmType,
+                  bedrockChatEnabled: true,
+                  isStreaming: false,
+                }),
+              };
+            })
+          );
+
+          // Run an evaluation for each graph so they show up separately (resulting in each dataset run grouped by connector)
+          await asyncForEach(graphs, async ({ name, graph }) => {
+            // Wrapper function for invoking the graph (to parse different input/output formats)
+            const predict = async (input: { input: string }) => {
+              logger.debug(`input:\n ${JSON.stringify(input, null, 2)}`);
+
+              const r = await graph.invoke(
+                { input: input.input }, // TODO: Update to use the correct input format per dataset type
+                {
+                  runName,
+                  tags: ['evaluation'],
+                }
+              );
+              const output = r.agentOutcome.returnValues.output;
+              return output;
+            };
+
+            const evalOutput = await evaluate(predict, {
+              data: datasetName ?? '',
+              evaluators: [], // Evals to be managed in LangSmith for now
+              experimentPrefix: name,
+              client: new Client({ apiKey: langSmithApiKey }),
+            });
+            logger.debug(`runResp:\n ${JSON.stringify(evalOutput, null, 2)}`);
           });
 
           return response.ok({
diff --git a/x-pack/plugins/elastic_assistant/server/routes/evaluate/utils.ts b/x-pack/plugins/elastic_assistant/server/routes/evaluate/utils.ts
index 46909805510e2..34f009e266515 100644
--- a/x-pack/plugins/elastic_assistant/server/routes/evaluate/utils.ts
+++ b/x-pack/plugins/elastic_assistant/server/routes/evaluate/utils.ts
@@ -5,59 +5,35 @@
  * 2.0.
  */
 
-import { Client } from 'langsmith';
-import type { ActionResult } from '@kbn/actions-plugin/server';
+import { Client, Example } from 'langsmith';
 import type { Logger } from '@kbn/core/server';
-import type { Run } from 'langsmith/schemas';
-import { ToolingLog } from '@kbn/tooling-log';
-import { Dataset } from '@kbn/elastic-assistant-common';
 import { isLangSmithEnabled } from '@kbn/langchain/server/tracers/langsmith';
 
 /**
- * Return connector name for the given connectorId/connectors
- *
- * @param connectorId
- * @param connectors
- */
-export const getConnectorName = (
-  connectorId: string,
-  connectors: ActionResult[]
-): string | undefined => {
-  return connectors.find((c) => c.id === connectorId)?.name;
-};
-
-/**
- * Fetches a dataset from LangSmith. Note that `client` will use env vars
+ * Fetches a dataset from LangSmith. Note that `client` will use env vars unless langSmithApiKey is specified
  *
  * @param datasetName
  * @param logger
+ * @param langSmithApiKey
  */
 export const fetchLangSmithDataset = async (
   datasetName: string | undefined,
-  logger: Logger
-): Promise<Dataset> => {
+  logger: Logger,
+  langSmithApiKey?: string
+): Promise<Example[]> => {
   if (datasetName === undefined || !isLangSmithEnabled()) {
     throw new Error('LangSmith dataset name not provided or LangSmith not enabled');
   }
 
   try {
-    const client = new Client();
+    const client = new Client({ apiKey: langSmithApiKey });
 
     const examples = [];
     for await (const example of client.listExamples({ datasetName })) {
       examples.push(example);
     }
 
-    // Convert to internal Dataset type -- TODO: add generic support for the different LangSmith test dataset formats
-    const dataset: Dataset = examples.map((example) => ({
-      id: example.id,
-      input: example.inputs.input as string,
-      reference: (example.outputs?.output as string) ?? '',
-      tags: [], // TODO: Consider adding tags from example data, e.g.: `datasetId:${example.dataset_id}`, `exampleName:${example.name}`
-      prediction: undefined,
-    }));
-
-    return dataset;
+    return examples;
   } catch (e) {
     logger.error(`Error fetching dataset from LangSmith: ${e.message}`);
     return [];
@@ -65,35 +41,28 @@ export const fetchLangSmithDataset = async (
 };
 
 /**
- * Write Feedback to LangSmith for a given Run
+ * Fetches the names of all LangSmith datasets, sorted. Note that `client` will use env vars unless langSmithApiKey is specified
  *
- * @param run
- * @param evaluationId
  * @param logger
+ * @param langSmithApiKey
  */
-export const writeLangSmithFeedback = async (
-  run: Run,
-  evaluationId: string,
-  logger: Logger | ToolingLog
-): Promise<string> => {
+export const fetchLangSmithDatasets = async ({
+  logger,
+  langSmithApiKey,
+}: {
+  logger: Logger;
+  langSmithApiKey?: string;
+}): Promise<string[]> => {
   try {
-    const client = new Client();
-    const feedback = {
-      score: run.feedback_stats?.score,
-      value: run.feedback_stats?.value,
-      correction: run.feedback_stats?.correction,
-      comment: run.feedback_stats?.comment,
-      sourceInfo: run.feedback_stats?.sourceInfo,
-      feedbackSourceType: run.feedback_stats?.feedbackSourceType,
-      sourceRunId: run.feedback_stats?.sourceRunId,
-      feedbackId: run.feedback_stats?.feedbackId,
-      eager: run.feedback_stats?.eager,
-    };
-    await client.createFeedback(run.id, evaluationId, feedback);
-    const runUrl = await client.getRunUrl({ run });
-    return runUrl;
+    const client = new Client({ apiKey: langSmithApiKey });
+    const datasets = [];
+    for await (const dataset of client.listDatasets()) {
+      datasets.push(dataset);
+    }
+
+    return datasets.map((d) => d.name).sort();
   } catch (e) {
-    logger.error(`Error writing feedback to LangSmith: ${e.message}`);
-    return '';
+    logger.error(`Error fetching datasets from LangSmith: ${e.message}`);
+    return [];
   }
 };
diff --git a/x-pack/plugins/elastic_assistant/tsconfig.json b/x-pack/plugins/elastic_assistant/tsconfig.json
index a2d0ec6c1d68a..da2f19445b299 100644
--- a/x-pack/plugins/elastic_assistant/tsconfig.json
+++ b/x-pack/plugins/elastic_assistant/tsconfig.json
@@ -46,6 +46,7 @@
     "@kbn/stack-connectors-plugin",
     "@kbn/security-plugin",
     "@kbn/apm-utils",
+    "@kbn/std",
   ],
   "exclude": [
     "target/**/*",
diff --git a/x-pack/plugins/translations/translations/fr-FR.json b/x-pack/plugins/translations/translations/fr-FR.json
index dd081f3bd2b28..c52ee4b30782b 100644
--- a/x-pack/plugins/translations/translations/fr-FR.json
+++ b/x-pack/plugins/translations/translations/fr-FR.json
@@ -15256,41 +15256,21 @@
     "xpack.elasticAssistant.assistant.settings.connectorTitle": "Connecteur",
     "xpack.elasticAssistant.assistant.settings.deleteButtonTitle": "Supprimer",
     "xpack.elasticAssistant.assistant.settings.editButtonTitle": "Modifier",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsDescription": "Sélectionnez les agents (algorithmes RAG) pour évaluer l'ensemble de données.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsLabel": "Agents",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlDescription": "URL pour l'application APM de Kibana. Utilisé pour établir un lien avec les traces APM pour les résultats de l'évaluation. La valeur par défaut est \"{defaultUrlPath}\".",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlLabel": "URL de l'APM",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsDescription": "Sélectionnez les modèles pour évaluer l'ensemble de données.",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsLabel": "Connecteurs / Modèles",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetDescription": "Ensemble de données personnalisées à évaluer. Tableau avec des objets aux propriétés \"input\" (entrée) et \"references\" (références).",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetLabel": "Personnalisé",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsDescription": "Évaluez la prédiction des résultats à l'aide d'un modèle (connecteur) et d'un critère d'évaluation spécifiques.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsTitle": "Évaluation (facultatif)",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptDescription": "Les variables de modèle `input`, `reference` et `prediction` d'un modèle d'invite donné.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "Invite d'évaluation",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "Type d'évaluation à effectuer, par exemple \"correctness\" \"esql-validator\" ou \"custom\".",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "Type d'évaluation",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "Ensemble de données",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactApmLinkText": "APM",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "Discover",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "Fermer cette boîte de dialogue annulera l'évaluation. Vous pouvez surveiller les logs de serveur Kibana pour visualiser la progression, et afficher les résultats dans {discover} / {apm}. Cela peut prendre plusieurs minutes pour les grands ensembles de données.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "Le modèle avec lequel effectuer l'évaluation finale.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelLabel": "Modèle évaluateur",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexDescription": "Index où inscrire les résultats. Doit commencer par \".kibana-elastic-ai-assistant-\".",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexLabel": "Index de sortie",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyDescription": "Clé API pour l'écriture de traces dans LangSmith. Stockez dans le stockage de session. Fermez la fenêtre pour effacer la session.",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyLabel": "Clé API LangSmith",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetDescription": "Le nom de l'ensemble de données hébergé sur LangSmith à évaluer.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetLabel": "LangSmith",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetPlaceholder": "Génération de requête ESQL",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectDescription": "Projet LangSmith dans lequel écrire les traces.",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectLabel": "Projet LangSmith",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.performEvaluationTitle": "Évaluation en cours...",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsDescription": "Choisissez les modèles (connecteurs) et les agents correspondants que l'ensemble de données doit utiliser.",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsTitle": "Prédictions",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectDescription": "Projet LangSmith vers lequel écrire les résultats.",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectLabel": "Projet",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectPlaceholder": "8.12 Tests",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsDescription": "Configurez les détails de l'exécution du test comme le projet, le nom de l'exécution, l'ensemble de données et l'index de sortie.",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsTitle": "Détails de l'exécution",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runNameDescription": "Nom de cette exécution du test spécifique.",
diff --git a/x-pack/plugins/translations/translations/ja-JP.json b/x-pack/plugins/translations/translations/ja-JP.json
index 800729b48217b..d212feaf16ec6 100644
--- a/x-pack/plugins/translations/translations/ja-JP.json
+++ b/x-pack/plugins/translations/translations/ja-JP.json
@@ -15242,41 +15242,21 @@
     "xpack.elasticAssistant.assistant.settings.connectorTitle": "コネクター",
     "xpack.elasticAssistant.assistant.settings.deleteButtonTitle": "削除",
     "xpack.elasticAssistant.assistant.settings.editButtonTitle": "編集",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsDescription": "データセットを評価するエージェント（RAG algos）を選択します。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsLabel": "エージェント",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlDescription": "Kibana APMアプリのURL。評価結果のAPMトレースにリンクするために使用されます。デフォルトは\"{defaultUrlPath}\"です。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlLabel": "APM URL",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsDescription": "データセットを評価するモデルを選択します。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsLabel": "コネクター/モデル",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetDescription": "評価するカスタムデータセット。\"input\"プロパティと\"references\"プロパティを含むオブジェクトの配列。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetLabel": "カスタム",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsDescription": "特定のモデル（コネクター）と評価基準を使用して予測結果を評価。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsTitle": "評価（任意）",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptDescription": "テンプレート変数`input`、`reference`、`prediction`を指定したプロンプトテンプレート。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "評価プロンプト",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "実行する評価のタイプ（例：\"correctness\" \"esql-validator\"、または\"custom\"）。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "評価タイプ",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "データセット",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactApmLinkText": "APM",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "Discover",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "このダイアログを閉じると、評価がキャンセルされます。Kibanaサーバーのログで進行状況を確認し、{discover} / {apm}に結果を表示できます。大規模なデータセットでは何分もかかる場合があります。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "最終評価を実行するモデル。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelLabel": "評価モデル",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexDescription": "結果を書き込むインデックス。\".kibana-elastic-ai-assistant-\"プレフィックスを付ける必要があります。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexLabel": "出力インデックス",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyDescription": "トレースをLangSmithに書き込むためのAPIキー。セッションストレージに保存されます。セッションを消去するには、タブを閉じてください。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyLabel": "LangSmith APIキー",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetDescription": "評価するLangSmithでホスティングされているデータセットの名前。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetLabel": "LangSmith",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetPlaceholder": "ESQLクエリー生成",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectDescription": "トレースを書き込むLangSmithプロジェクト。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectLabel": "LangSmithプロジェクト",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.performEvaluationTitle": "評価を実行...",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsDescription": "データセットが実行対象とするモデル（コネクター）と対応するエージェントを選択します。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsTitle": "予測",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectDescription": "結果を書き込むLangSmithプロジェクト。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectLabel": "プロジェクト",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectPlaceholder": "8.12テスト",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsDescription": "プロジェクト、実行名、データセット、出力インデックスなどのテスト実行の詳細を設定します。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsTitle": "実行詳細",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runNameDescription": "この特定のテスト実行の名前。",
diff --git a/x-pack/plugins/translations/translations/zh-CN.json b/x-pack/plugins/translations/translations/zh-CN.json
index ca9ffa4c17f4e..0c8effa03f31d 100644
--- a/x-pack/plugins/translations/translations/zh-CN.json
+++ b/x-pack/plugins/translations/translations/zh-CN.json
@@ -15267,41 +15267,21 @@
     "xpack.elasticAssistant.assistant.settings.connectorTitle": "连接器",
     "xpack.elasticAssistant.assistant.settings.deleteButtonTitle": "删除",
     "xpack.elasticAssistant.assistant.settings.editButtonTitle": "编辑",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsDescription": "选择要依据其评估数据集的代理（RAG 算法）。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.agentsLabel": "代理",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlDescription": "Kibana APM 应用的 URL。用于链接到 APM 跟踪以获取评估结果。默认为“{defaultUrlPath}”。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.apmUrlLabel": "APM URL",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsDescription": "选择要依据其评估数据集的模型。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.connectorsLabel": "连接器/模型",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetDescription": "要评估的定制数据集。具有“input”和“references”属性的对象数组。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.customDatasetLabel": "定制",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsDescription": "使用特定模型（连接器）和评估标准评估预测结果。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationDetailsTitle": "评估（可选）",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptDescription": "提示模板给定的 `input`、`reference` 和 `prediction` 模板变量。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationPromptLabel": "评估提示",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeDescription": "要执行的评估类型，如“正确性”、“esql 验证器”或“定制”。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluationTypeLabel": "评估类型",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorDatasetLabel": "数据集",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactApmLinkText": "APM",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactDiscoverLinkText": "Discover",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorFunFactText": "关闭此对话框将取消评估。您可以查看 Kibana 服务器日志以了解进度，并在 {discover} {apm} 中查看结果。大型数据集可能需要许多时间。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelDescription": "执行最终评估的模型。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorModelLabel": "评估器模型",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexDescription": "要向其中写入结果的索引。必须以“.kibana-elastic-ai-assistant-”为前缀。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.evaluatorOutputIndexLabel": "输出索引",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyDescription": "用于将跟踪写入到 LangSmith 的 API 密钥。已存储在会话存储中。关闭选项卡以清除会话。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithApiKeyLabel": "LangSmith API 密钥",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetDescription": "LangSmith 上托管的要评估的数据集的名称。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetLabel": "LangSmith",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langsmithDatasetPlaceholder": "ESQL 查询生成",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectDescription": "要向其中写入跟踪的 LangSmith 项目。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.langSmithProjectLabel": "LangSmith 项目",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.performEvaluationTitle": "执行评估……",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsDescription": "选择应根据其运行数据集的模型（连接器）和对应代理。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.predictionDetailsTitle": "预测",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectDescription": "要向其中写入结果的 LangSmith 项目。",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectLabel": "项目",
-    "xpack.elasticAssistant.assistant.settings.evaluationSettings.projectPlaceholder": "8.12 测试",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsDescription": "配置测试运行详情，如项目、运行名称、数据集和输出索引。",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runDetailsTitle": "运行详情",
     "xpack.elasticAssistant.assistant.settings.evaluationSettings.runNameDescription": "此特定测试运行的名称。",