Skip to content

Commit

Permalink
Refactor Eval for better tracing and error reporting (#3)
Browse files Browse the repository at this point in the history
Co-authored-by: Chris Chestnut <[email protected]>
Co-authored-by: Cleo Schneider <[email protected]>
Co-authored-by: Samuel Bushi <[email protected]>
  • Loading branch information
4 people authored and pavelgj committed May 1, 2024
1 parent 5d9f521 commit c99754f
Show file tree
Hide file tree
Showing 11 changed files with 175 additions and 84 deletions.
31 changes: 14 additions & 17 deletions genkit-tools/src/commands/eval-flow-run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,25 +123,22 @@ export const evalFlowRun = new Command('eval:flow')
return;
}

const datasetToEval = await fetchDataSet(runner, flowName, states);
const evalDataset = await fetchDataSet(runner, flowName, states);

const scores: Record<string, any> = {};
await Promise.all(
filteredEvaluatorActions.map(async (action) => {
const name = evaluatorName(action);
logger.info(`Running evaluator '${name}'...`);
const response = await runner.runAction({
key: name,
input: {
dataset: datasetToEval,
auth: options.auth ? JSON.parse(options.auth) : undefined,
},
});
scores[name] = response.result;
})
);

const scoredResults = enrichResultsWithScoring(scores, datasetToEval);
for (const action of filteredEvaluatorActions) {
const name = evaluatorName(action);
logger.info(`Running evaluator '${name}'...`);
const response = await runner.runAction({
key: name,
input: {
dataset: evalDataset,
auth: options.auth ? JSON.parse(options.auth) : undefined,
},
});
scores[name] = response.result;
}
const scoredResults = enrichResultsWithScoring(scores, evalDataset);

if (options.output) {
logger.info(`Writing results to '${options.output}'...`);
Expand Down
32 changes: 15 additions & 17 deletions genkit-tools/src/commands/eval-run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import {
enrichResultsWithScoring,
getLocalFileEvalStore,
} from '../eval';
import { EvaluatorResponse } from '../types/evaluators';
import { EvalResponses } from '../types/evaluators';
import { confirmLlmUse, evaluatorName, isEvaluator } from '../utils/eval';
import { logger } from '../utils/logger';
import { runInRunnerThenStop } from '../utils/runner-utils';
Expand Down Expand Up @@ -52,7 +52,7 @@ export const evalRun = new Command('eval:run')
const evalStore = getLocalFileEvalStore();

logger.debug(`Loading data from '${dataset}'...`);
const datasetToEval: EvalInput[] = JSON.parse(
const evalDataset: EvalInput[] = JSON.parse(
(await readFile(dataset)).toString('utf-8')
).map((testCase: any) => ({
...testCase,
Expand Down Expand Up @@ -97,22 +97,20 @@ export const evalRun = new Command('eval:run')
return;
}

const scores: Record<string, EvaluatorResponse> = {};
await Promise.all(
filteredEvaluatorActions.map(async (action) => {
const name = evaluatorName(action);
logger.info(`Running evaluator '${name}'...`);
const response = await runner.runAction({
key: name,
input: {
dataset: datasetToEval,
},
});
scores[name] = response.result as EvaluatorResponse;
})
);
const scores: Record<string, EvalResponses> = {};
for (const action of filteredEvaluatorActions) {
const name = evaluatorName(action);
logger.info(`Running evaluator '${name}'...`);
const response = await runner.runAction({
key: name,
input: {
evalDataset,
},
});
scores[name] = response.result as EvalResponses;
}

const scoredResults = enrichResultsWithScoring(scores, datasetToEval);
const scoredResults = enrichResultsWithScoring(scores, evalDataset);

if (options.output) {
logger.info(`Writing results to '${options.output}'...`);
Expand Down
8 changes: 5 additions & 3 deletions genkit-tools/src/eval/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
*/

import { EvalInput, EvalMetric, EvalResult } from '../eval';
import { EvaluatorResponse } from '../types/evaluators';
import { EvalResponse, EvalResponses } from '../types/evaluators';

/**
* Combines EvalInput with the generated scores to create a storable EvalResult.
*/
export function enrichResultsWithScoring(
scores: Record<string, EvaluatorResponse>,
scores: Record<string, EvalResponses>,
evalDataset: EvalInput[]
): EvalResult[] {
const scoreMap: Record<string, EvalMetric[]> = {};
Object.keys(scores).forEach((evaluator) => {
const evaluatorResponse = scores[evaluator];
evaluatorResponse.forEach((scoredSample) => {
evaluatorResponse.forEach((scoredSample: EvalResponse) => {
if (!scoredSample.testCaseId) {
throw new Error('testCaseId expected to be present');
}
Expand All @@ -40,6 +40,8 @@ export function enrichResultsWithScoring(
score: score.score,
rationale: score.details?.reasoning,
error: score.error,
traceId: scoredSample.traceId,
spanId: scoredSample.spanId,
});
});
});
Expand Down
2 changes: 2 additions & 0 deletions genkit-tools/src/eval/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ export const EvalMetricSchema = z.object({
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
rationale: z.string().optional(),
error: z.string().optional(),
traceId: z.string().optional(),
spanId: z.string().optional(),
});
export type EvalMetric = z.infer<typeof EvalMetricSchema>;

Expand Down
18 changes: 10 additions & 8 deletions genkit-tools/src/types/evaluators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ export const ScoreSchema = z.object({
.optional(),
});

export const EvaluatorResponseSchema = z.array(
z.object({
sampleIndex: z.number(),
testCaseId: z.string().optional(),
evaluation: ScoreSchema,
})
);
/**
 * Schema for a single evaluator result: the score for one test case,
 * plus identifiers linking it back to its position in the dataset and
 * to the trace/span recorded while evaluating it.
 */
export const EvalResponseSchema = z.object({
// position of the evaluated sample within the input dataset
sampleIndex: z.number(),
testCaseId: z.string().optional(),
// traceId/spanId tie this result to the tracing data for the run
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: ScoreSchema,
});
export type EvalResponse = z.infer<typeof EvalResponseSchema>;

// NOTE(review): `EvaluatorResponseSchema` is not defined in this file after the
// rename to `EvalResponseSchema` — this alias appears to be a leftover from the
// previous API; confirm it can be removed.
export type EvaluatorResponse = z.infer<typeof EvaluatorResponseSchema>;
/** Schema for the full set of evaluator results (one entry per test case). */
export const EvalResponsesSchema = z.array(EvalResponseSchema);
export type EvalResponses = z.infer<typeof EvalResponsesSchema>;
4 changes: 2 additions & 2 deletions genkit-tools/tests/eval/parser_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import {
EvalResult,
enrichResultsWithScoring,
} from '../../src/eval';
import { EvaluatorResponse } from '../../src/types/evaluators';
import { EvalResponses } from '../../src/types/evaluators';

describe('parser', () => {
const evalRunResults: EvalResult[] = [
Expand Down Expand Up @@ -50,7 +50,7 @@ describe('parser', () => {
},
];

const evaluatorOutput: Record<string, EvaluatorResponse> = {
const evaluatorOutput: Record<string, EvalResponses> = {
'/evaluator/ragas/faithfulness': [
{
testCaseId: 'case1',
Expand Down
101 changes: 81 additions & 20 deletions js/ai/src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,18 @@
*/

import { action, Action } from '@genkit-ai/core';
import { logger } from '@genkit-ai/core/logging';
import { lookupAction, registerAction } from '@genkit-ai/core/registry';
import { setCustomMetadataAttributes } from '@genkit-ai/core/tracing';
import {
runInNewSpan,
setCustomMetadataAttributes,
SPAN_TYPE_ATTR,
} from '@genkit-ai/core/tracing';
import * as z from 'zod';

/** Namespace prefix applied to all Genkit span attributes. */
export const ATTR_PREFIX = 'genkit';
/** Fully-qualified attribute key recording a span's state. */
export const SPAN_STATE_ATTR = `${ATTR_PREFIX}:state`;

export const BaseDataPointSchema = z.object({
input: z.unknown(),
output: z.unknown().optional(),
Expand All @@ -43,33 +51,36 @@ export const ScoreSchema = z.object({
// Metadata key flagging that an evaluator itself invokes an LLM (used to
// warn callers about potential cost before running it).
export const EVALUATOR_METADATA_KEY_USES_LLM = 'evaluatorUsesLlm';

/** A single evaluation score, as produced by an evaluator. */
export type Score = z.infer<typeof ScoreSchema>;

/** One input/output sample to be evaluated. */
export type BaseDataPoint = z.infer<typeof BaseDataPointSchema>;
/**
 * A collection of data points to evaluate; parameterized by the data-point
 * schema so custom evaluators can carry richer per-sample shapes.
 */
export type Dataset<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
> = Array<z.infer<DataPoint>>;

export const EvaluatorResponseSchema = z.array(
z.object({
sampleIndex: z.number(),
testCaseId: z.string().optional(),
evaluation: ScoreSchema,
})
);
/**
 * Schema for the result of evaluating a single test case: the score plus
 * identifiers tying it to the dataset position and the trace/span recorded
 * for that evaluation.
 */
export const EvalResponseSchema = z.object({
// position of the evaluated sample within the input dataset, when known
sampleIndex: z.number().optional(),
testCaseId: z.string().optional(),
// traceId/spanId link the result to tracing data captured during the run
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: ScoreSchema,
});
export type EvalResponse = z.infer<typeof EvalResponseSchema>;

// NOTE(review): `EvaluatorResponseSchema` is not defined in this file after
// the rename to `EvalResponseSchema` — this alias looks like a leftover from
// the pre-rename API; confirm it can be deleted.
export type EvaluatorResponse = z.infer<typeof EvaluatorResponseSchema>;
// TODO remove EvalResponses in favor of EvalResponse[]
export const EvalResponsesSchema = z.array(EvalResponseSchema);
export type EvalResponses = z.infer<typeof EvalResponsesSchema>;

type EvaluatorFn<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
> = (
input: Dataset<DataPoint>,
input: z.infer<DataPoint>,
evaluatorOptions?: z.infer<CustomOptions>
) => Promise<EvaluatorResponse>;
) => Promise<EvalResponse>;

export type EvaluatorAction<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
> = Action<typeof EvalRequestSchema, typeof EvaluatorResponseSchema> & {
> = Action<typeof EvalRequestSchema, typeof EvalResponsesSchema> & {
__dataPointType?: DataPoint;
__configSchema?: CustomOptions;
};
Expand All @@ -78,7 +89,7 @@ function withMetadata<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,
>(
evaluator: Action<typeof EvalRequestSchema, typeof EvaluatorResponseSchema>,
evaluator: Action<typeof EvalRequestSchema, typeof EvalResponsesSchema>,
dataPointType?: DataPoint,
configSchema?: CustomOptions
): EvaluatorAction<DataPoint, CustomOptions> {
Expand Down Expand Up @@ -120,18 +131,68 @@ export function defineEvaluator<
: z.array(BaseDataPointSchema),
options: options.configSchema ?? z.unknown(),
}),
outputSchema: EvaluatorResponseSchema,
outputSchema: EvalResponsesSchema,
metadata: metadata,
},
(i) => {
async (i) => {
setCustomMetadataAttributes({ subtype: 'evaluator' });
return runner(i.dataset, i.options);
let evalResponses: EvalResponses = [];
for (let index = 0; index < i.dataset.length; index++) {
const datapoint = i.dataset[index];
let spanId;
let traceId;
try {
await runInNewSpan(
{
metadata: {
name: `Test Case ${datapoint.testCaseId}`,
},
labels: {
[SPAN_TYPE_ATTR]: 'evaluator',
},
},
async (metadata, otSpan) => {
try {
spanId = otSpan.spanContext().spanId;
traceId = otSpan.spanContext().traceId;
metadata.input = datapoint.input;
const testCaseOutput = await runner(datapoint, i.options);
testCaseOutput.sampleIndex = index;
testCaseOutput.spanId = spanId;
testCaseOutput.traceId = traceId;
metadata.output = testCaseOutput;
evalResponses.push(testCaseOutput);
return testCaseOutput;
} catch (e) {
const err = {
sampleIndex: index,
spanId,
traceId,
testCaseId: datapoint.testCaseId,
evaluation: {
error: `Evaluation of test case ${datapoint.testCaseId} failed: \n${(e as Error).stack}`,
},
};
metadata.output = err;
evalResponses.push(err);
throw e;
}
}
);
} catch (e) {
logger.error(
`Evaluation of test case ${datapoint.testCaseId} failed: \n${(e as Error).stack}`
);
continue;
}
}
return evalResponses;
}
);
const ewm = withMetadata(
evaluator as any as Action<
typeof EvalRequestSchema,
typeof EvaluatorResponseSchema
typeof EvalResponsesSchema
>,
options.dataPointType,
options.configSchema
Expand All @@ -158,7 +219,7 @@ export async function evaluate<
evaluator: EvaluatorArgument<DataPoint, EvaluatorOptions>;
dataset: Dataset<DataPoint>;
options?: z.infer<EvaluatorOptions>;
}): Promise<EvaluatorResponse> {
}): Promise<EvalResponses> {
let evaluator: EvaluatorAction<DataPoint, EvaluatorOptions>;
if (typeof params.evaluator === 'string') {
evaluator = await lookupAction(`/evaluator/${params.evaluator}`);
Expand All @@ -176,7 +237,7 @@ export async function evaluate<
return (await evaluator({
dataset: params.dataset,
options: params.options,
})) as EvaluatorResponse;
})) as EvalResponses;
}

export const EvaluatorInfoSchema = z.object({
Expand Down
1 change: 1 addition & 0 deletions js/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"@opentelemetry/sdk-node": "^0.49.0",
"@opentelemetry/sdk-trace-base": "^1.22.0",
"ajv": "^8.12.0",
"async-mutex": "^0.5.0",
"express": "^4.19.2",
"express-openapi-validator": "^5.1.3",
"json-schema": "^0.4.0",
Expand Down
3 changes: 1 addition & 2 deletions js/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ export const GENKIT_CLIENT_HEADER = `genkit-node/${GENKIT_VERSION} gl-node/${pro

export * from './action.js';
export * from './config.js';
export { GenkitError } from './error.js';
export * from './flowTypes.js';
export * from './telemetryTypes.js';

export { GenkitError } from './error.js';
Loading

0 comments on commit c99754f

Please sign in to comment.