[Security Assistant] Adds support for LangGraph evaluations (elastic#190574)

## Summary

This PR updates the existing evaluation framework to support LangGraph.
Since the evaluation code was the last reference to the old agent
executors, we were finally able to remove those as well.

The evaluation feature remains behind a feature flag, and can be enabled
with the following configuration:

```
xpack.securitySolution.enableExperimental:
  - 'assistantModelEvaluation'
```

Once enabled, the `Evaluation` tab will become visible in settings:

<p align="center">
<img width="800"
src="https://github.com/user-attachments/assets/8a0b8691-73a3-43b7-996b-8cc408edd5ab"
/>
</p> 


Notes:
* We no longer write evaluation results to a local ES index. We could
still do this, but most of the value comes from viewing the results in
LangSmith, so I didn't re-plumb this functionality after switching over
to the new LangSmith `evaluator` function.
* Support for custom datasets can be added back if we find it useful;
currently only LangSmith datasets are supported. I ended up porting over
the `GET datasets` API from
elastic#181348 to make this more useful:
the `GET evaluate` route now returns `datasets`, an array of dataset
names from LangSmith (see the sketch after these notes).
* Some additional fields still needed to be ported over to the POST
evaluation API, like `size` and `alertsIndexPattern`. Update: these are
now ported to the API; they just need to be exposed in the UI.
* `Project name` was removed from the eval UI: with the new LangSmith
`evaluator` we no longer need to tag runs to a specific project, since
they automatically show up under the `Experiments` section.
* The 'Evaluation (Optional)' section currently isn't used, so it has
been removed. We can re-enable it when there is a need to run local
evals on predictions outside of LangSmith.
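
For reference, here is a minimal sketch of consuming the updated `GET evaluate` response. It assumes Kibana's `HttpSetup` client, `'1'` stands in for `API_VERSIONS.internal.v1`, and the interface simply mirrors the generated `GetEvaluateResponse` schema in this PR:

```
// Sketch: fetching the datasets and graphs available for evaluation.
import type { HttpSetup } from '@kbn/core-http-browser';

interface GetEvaluateResponse {
  datasets: string[]; // dataset names pulled from LangSmith
  graphs: string[]; // e.g. ['DefaultAssistantGraph']
}

const fetchEvaluationOptions = (http: HttpSetup) =>
  http.get<GetEvaluateResponse>('/internal/elastic_assistant/evaluate', {
    version: '1',
  });
```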


To test, set a `Run name`, enter a dataset from LangSmith (e.g.
`LangGraph Eval Testing`), select a few connectors and the
`DefaultAssistantGraph`, then click `Perform evaluation...`. Results
will show up in LangSmith under `Datasets & Testing`. An API-level
equivalent is sketched below.
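
If you'd rather hit the API directly, the following sketch is the rough equivalent of the UI steps above. It assumes Kibana's `HttpSetup` client; the connector ID is a placeholder and `'1'` stands in for `API_VERSIONS.internal.v1`:

```
// Sketch: kicking off an evaluation via the internal POST route.
import type { HttpSetup } from '@kbn/core-http-browser';

const runEvaluation = (http: HttpSetup) =>
  http.post('/internal/elastic_assistant/evaluate', {
    version: '1',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      graphs: ['DefaultAssistantGraph'],
      datasetName: 'LangGraph Eval Testing',
      connectorIds: ['my-gemini-connector'], // hypothetical connector ID
      runName: 'LangGraph eval smoke test',
    }),
  });
```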

Note: It's easy to run into rate-limiting errors with Gemini, so keep
that in mind when running larger datasets. The new LangSmith
`evaluator` function has a `maxConcurrency` option to control the
maximum number of concurrent evaluations, so we can tweak that as
needed; a rough sketch follows.
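
To illustrate, here is a sketch of how `maxConcurrency` is passed, assuming the `evaluate` helper from the LangSmith JS SDK; the target function is a stand-in for invoking the selected assistant graph, not the PR's actual wiring:

```
// Sketch: throttling concurrent evaluation runs with maxConcurrency.
import { evaluate } from 'langsmith/evaluation';

const target = async (input: Record<string, unknown>) => {
  // Placeholder: the real target would invoke the selected graph
  // (e.g. DefaultAssistantGraph) and return its prediction.
  return { output: JSON.stringify(input) };
};

await evaluate(target, {
  data: 'LangGraph Eval Testing', // LangSmith dataset name
  experimentPrefix: 'gemini-eval', // hypothetical run-name prefix
  maxConcurrency: 2, // keep low to stay under provider rate limits
});
```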

Once complete, you can compare all results side-by-side in LangSmith
:tada:



<img width="2312" alt="image"
src="https://github.com/user-attachments/assets/7ca31722-7400-4717-9735-d6c1c97b6e49">

---------

Co-authored-by: kibanamachine <[email protected]>
spong and kibanamachine authored Aug 20, 2024
1 parent 77ad05e commit c276638
Showing 35 changed files with 518 additions and 2,430 deletions.
3 changes: 3 additions & 0 deletions x-pack/packages/kbn-elastic-assistant-common/constants.ts
@@ -50,3 +50,6 @@ export const ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL_FIND =
`${ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL}/_find` as const;
export const ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL_BULK_ACTION =
`${ELASTIC_AI_ASSISTANT_KNOWLEDGE_BASE_ENTRIES_URL}/_bulk_action` as const;

export const ELASTIC_AI_ASSISTANT_EVALUATE_URL =
`${ELASTIC_AI_ASSISTANT_INTERNAL_URL}/evaluate` as const;
@@ -1,7 +1,7 @@
openapi: 3.0.0
info:
title: Execute Connector API endpoint
version: '1'
version: "1"
paths:
/internal/elastic_assistant/actions/connector/{connectorId}/_execute:
post:
@@ -103,4 +103,3 @@ paths:
type: string
message:
type: string

@@ -18,5 +18,6 @@ import { z } from 'zod';

export type GetEvaluateResponse = z.infer<typeof GetEvaluateResponse>;
export const GetEvaluateResponse = z.object({
agentExecutors: z.array(z.string()),
datasets: z.array(z.string()),
graphs: z.array(z.string()),
});
@@ -1,32 +1,37 @@
openapi: 3.0.0
info:
title: Get Evaluate API endpoint
version: '1'
version: "1"
paths:
/internal/elastic_assistant/evaluate:
get:
x-codegen-enabled: true
x-labels: [ess, serverless]
operationId: GetEvaluate
description: Get relevant data for performing an evaluation like available sample data, agents, and evaluators
description: Get relevant data for performing an evaluation like available sample data, graphs, and evaluators
summary: Get relevant data for performing an evaluation
tags:
- Evaluation API
responses:
'200':
"200":
description: Successful response
content:
application/json:
schema:
type: object
properties:
agentExecutors:
datasets:
type: array
items:
type: string
graphs:
type: array
items:
type: string
required:
- agentExecutors
'400':
- datasets
- graphs
"400":
description: Generic Error
content:
application/json:
@@ -16,63 +16,19 @@

import { z } from 'zod';

export type OutputIndex = z.infer<typeof OutputIndex>;
export const OutputIndex = z.string().regex(/^.kibana-elastic-ai-assistant-/);

export type DatasetItem = z.infer<typeof DatasetItem>;
export const DatasetItem = z.object({
id: z.string().optional(),
input: z.string(),
prediction: z.string().optional(),
reference: z.string(),
tags: z.array(z.string()).optional(),
});

export type Dataset = z.infer<typeof Dataset>;
export const Dataset = z.array(DatasetItem).default([]);
import { Replacements } from '../conversations/common_attributes.gen';

export type PostEvaluateBody = z.infer<typeof PostEvaluateBody>;
export const PostEvaluateBody = z.object({
dataset: Dataset.optional(),
evalPrompt: z.string().optional(),
});

export type PostEvaluateRequestQuery = z.infer<typeof PostEvaluateRequestQuery>;
export const PostEvaluateRequestQuery = z.object({
/**
* Agents parameter description
*/
agents: z.string(),
/**
* Dataset Name parameter description
*/
datasetName: z.string().optional(),
/**
* Evaluation Type parameter description
*/
evaluationType: z.string().optional(),
/**
* Eval Model parameter description
*/
evalModel: z.string().optional(),
/**
* Models parameter description
*/
models: z.string(),
/**
* Output Index parameter description
*/
outputIndex: OutputIndex,
/**
* Project Name parameter description
*/
projectName: z.string().optional(),
/**
* Run Name parameter description
*/
graphs: z.array(z.string()),
datasetName: z.string(),
connectorIds: z.array(z.string()),
runName: z.string().optional(),
alertsIndexPattern: z.string().optional().default('.alerts-security.alerts-default'),
langSmithApiKey: z.string().optional(),
replacements: Replacements.optional().default({}),
size: z.number().optional().default(20),
});
export type PostEvaluateRequestQueryInput = z.input<typeof PostEvaluateRequestQuery>;

export type PostEvaluateRequestBody = z.infer<typeof PostEvaluateRequestBody>;
export const PostEvaluateRequestBody = PostEvaluateBody;
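
As a quick illustration of the defaults above, parsing a minimal payload with the generated schema fills in `alertsIndexPattern`, `replacements`, and `size`; the dataset name and connector ID below are placeholders:

```
// Sketch: zod applies the declared defaults for omitted fields.
const parsed = PostEvaluateBody.parse({
  graphs: ['DefaultAssistantGraph'],
  datasetName: 'LangGraph Eval Testing',
  connectorIds: ['my-bedrock-connector'], // hypothetical connector ID
});
// parsed.alertsIndexPattern === '.alerts-security.alerts-default'
// parsed.size === 20
```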
@@ -1,14 +1,14 @@
openapi: 3.0.0
info:
title: Post Evaluate API endpoint
version: '1'
version: "1"
paths:
/internal/elastic_assistant/evaluate:
post:
x-codegen-enabled: true
x-labels: [ess, serverless]
operationId: PostEvaluate
description: Perform an evaluation using sample data against a combination of Agents and Connectors
description: Perform an evaluation using sample data against a combination of Graphs and Connectors
summary: Performs an evaluation of the Elastic Assistant
tags:
- Evaluation API
@@ -17,53 +17,9 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/PostEvaluateBody'
parameters:
- name: agents
in: query
description: Agents parameter description
required: true
schema:
type: string
- name: datasetName
in: query
description: Dataset Name parameter description
schema:
type: string
- name: evaluationType
in: query
description: Evaluation Type parameter description
schema:
type: string
- name: evalModel
in: query
description: Eval Model parameter description
schema:
type: string
- name: models
in: query
description: Models parameter description
required: true
schema:
type: string
- name: outputIndex
in: query
description: Output Index parameter description
required: true
schema:
$ref: '#/components/schemas/OutputIndex'
- name: projectName
in: query
description: Project Name parameter description
schema:
type: string
- name: runName
in: query
description: Run Name parameter description
schema:
type: string
$ref: "#/components/schemas/PostEvaluateBody"
responses:
'200':
"200":
description: Successful response
content:
application/json:
@@ -77,7 +33,7 @@ paths:
required:
- evaluationId
- success
'400':
"400":
description: Generic Error
content:
application/json:
@@ -92,36 +48,33 @@ paths:
type: string
components:
schemas:
OutputIndex:
type: string
pattern: '^.kibana-elastic-ai-assistant-'
DatasetItem:
PostEvaluateBody:
type: object
required:
- graphs
- datasetName
- connectorIds
properties:
id:
type: string
input:
type: string
prediction:
type: string
reference:
graphs:
type: array
items:
type: string
datasetName:
type: string
tags:
connectorIds:
type: array
items:
type: string
required:
- input
- reference
Dataset:
type: array
items:
$ref: '#/components/schemas/DatasetItem'
default: []
PostEvaluateBody:
type: object
properties:
dataset:
$ref: '#/components/schemas/Dataset'
evalPrompt:
runName:
type: string
alertsIndexPattern:
type: string
default: ".alerts-security.alerts-default"
langSmithApiKey:
type: string
replacements:
$ref: "../conversations/common_attributes.schema.yaml#/components/schemas/Replacements"
default: {}
size:
type: number
default: 20
@@ -7,7 +7,7 @@

import { postEvaluation } from './evaluate';
import { HttpSetup } from '@kbn/core-http-browser';
import { API_VERSIONS } from '@kbn/elastic-assistant-common';
import { API_VERSIONS, PostEvaluateRequestBodyInput } from '@kbn/elastic-assistant-common';

jest.mock('@kbn/core-http-browser');

@@ -16,39 +16,26 @@ const mockHttp = {
} as unknown as HttpSetup;

describe('postEvaluation', () => {
it('calls the knowledge base API when correct resource path', async () => {
const evalParams: PostEvaluateRequestBodyInput = {
graphs: ['not', 'alphabetical'],
datasetName: 'Test Dataset',
runName: 'Test Run Name',
connectorIds: ['not', 'alphabetical'],
};

it('calls the evaluate API when correct resource path', async () => {
(mockHttp.post as jest.Mock).mockResolvedValue({ success: true });

const testProps = {
http: mockHttp,
evalParams: {
agents: ['not', 'alphabetical'],
dataset: '{}',
datasetName: 'Test Dataset',
projectName: 'Test Project Name',
runName: 'Test Run Name',
evalModel: ['not', 'alphabetical'],
evalPrompt: 'evalPrompt',
evaluationType: ['not', 'alphabetical'],
models: ['not', 'alphabetical'],
outputIndex: 'outputIndex',
},
evalParams,
};

await postEvaluation(testProps);

expect(mockHttp.post).toHaveBeenCalledWith('/internal/elastic_assistant/evaluate', {
body: '{"dataset":{},"evalPrompt":"evalPrompt"}',
body: '{"graphs":["not","alphabetical"],"datasetName":"Test Dataset","runName":"Test Run Name","connectorIds":["not","alphabetical"]}',
headers: { 'Content-Type': 'application/json' },
query: {
models: 'alphabetical,not',
agents: 'alphabetical,not',
datasetName: 'Test Dataset',
evaluationType: 'alphabetical,not',
evalModel: 'alphabetical,not',
outputIndex: 'outputIndex',
projectName: 'Test Project Name',
runName: 'Test Run Name',
},
signal: undefined,
version: API_VERSIONS.internal.v1,
});
@@ -59,11 +46,12 @@ describe('postEvaluation', () => {
throw new Error(error);
});

const knowledgeBaseArgs = {
const evaluationArgs = {
resource: 'a-resource',
http: mockHttp,
evalParams,
};

await expect(postEvaluation(knowledgeBaseArgs)).resolves.toThrowError('simulated error');
await expect(postEvaluation(evaluationArgs)).rejects.toThrowError('simulated error');
});
});