Skip to content

Commit

Permalink
Added Question Answering quality metric for Synthetic Q&A generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Abhinav Rau committed Sep 1, 2024
1 parent d9dec29 commit 3b1eef4
Show file tree
Hide file tree
Showing 17 changed files with 488 additions and 156 deletions.
83 changes: 62 additions & 21 deletions src/common.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
/**
 * Returns the index of the first row in a 2-D array whose first cell equals
 * the given value.
 *
 * @param {Array<Array<*>>} array2D - Rows whose first element is the lookup key.
 * @param {*} searchValue - Value compared (strict equality) against row[0].
 * @returns {number} Index of the matching row, or 0 when no row matches.
 */
export function findIndexByColumnsNameIn2DArray(array2D, searchValue) {
  const rowIndex = array2D.findIndex((row) => row[0] === searchValue);
  // NOTE(review): a miss falls back to 0, which is indistinguishable from a
  // hit on the first row — callers cannot detect "not found". Confirm whether
  // -1 (or null) would be safer before changing this sentinel.
  return rowIndex === -1 ? 0 : rowIndex;
}

// Vertex AI Search Table Format

export const vertex_ai_search_configValues = [
Expand Down Expand Up @@ -43,9 +54,8 @@ export const vertex_ai_search_testTableHeader = [

export var summaryMatching_prompt =
"You will get two answers to a question, you should determine if they are semantically similar or not. ";
export var summaryMatching_examples =
" examples - answer_1: I was created by X. answer_2: X created me. output:same " +
"answer_1:There are 52 days in a year. answer_2: A year is fairly long. output:different ";
export var summaryMatching_examples = `examples - answer_1: I was created by X. answer_2: X created me. output:same
answer_1:There are 52 days in a year. answer_2: A year is fairly long. output:different `;

// Synthetic Q&A Table Format
export const synth_q_and_a_configValues = [
Expand All @@ -55,27 +65,53 @@ export const synth_q_and_a_configValues = [
["Gemini Model ID", "gemini-1.5-flash-001"],
[
"System Instructions",
"You are an expert in reading call center policy and procedure documents." +
"Given the attached document, generate a question and answer that customers are likely to ask a call center agent." +
"The question should only be sourced from the provided the document.Do not use any other information other than the attached document. " +
"Explain your reasoning for the answer by quoting verbatim where in the document the answer is found. Return the results in JSON format.Example: " +
"{'question': 'Here is a question?', 'answer': 'Here is the answer', 'reasoning': 'Quote from document'}",
`Given the attached document, generate a question and an answer.The question should only be sourced from the provided the document. Do not use any other information other than the attached document. Explain your reasoning for the answer by quoting verbatim where in the document the answer is found. Return the results in JSON format.Example: {'question': 'Here is a question?', 'answer': 'Here is the answer', 'reasoning': 'Quote from document'}`,
],
["Batch Size (1-10)", "4"], // BatchSize
["Time between Batches in Seconds (1-10)", "2"],
[
"Prompt",
`You are an expert in reading call center policy and procedure documents. Generate question and answer a customer would ask from a Bank using the attached document.`,
],
["Generate Q & A Quality", "TRUE"],
[
"Q & A Quality Prompt",
`# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user prompt and an AI-generated responses.
You should first read the user prompt carefully for analyzing the task, and then evaluate the quality of the responses based on and rules provided in the Evaluation section below.
# Evaluation
## Metric Definition
You will be assessing question answering quality, which measures the overall quality of the answer to the question in user prompt. Pay special attention to length constraints, such as in X words or in Y sentences. The instruction for performing a question-answering task is provided in the user prompt. The response should not contain information that is not present in the context (if it is provided).
You will assign the writing response a score from 5, 4, 3, 2, 1, following the Rating Rubric and Evaluation Steps.
Give step-by-step explanations for your scoring, and only choose scores from 5, 4, 3, 2, 1.
## Criteria Definition
Instruction following: The response demonstrates a clear understanding of the question answering task instructions, satisfying all of the instruction's requirements.
Groundedness: The response contains information included only in the context if the context is present in user prompt. The response does not reference any outside information.
Completeness: The response completely answers the question with suffient detail.
Fluent: The response is well-organized and easy to read.
## Rating Rubric
5: (Very good). The answer follows instructions, is grounded, complete, and fluent.
4: (Good). The answer follows instructions, is grounded, complete, but is not very fluent.
3: (Ok). The answer mostly follows instructions, is grounded, answers the question partially and is not very fluent.
2: (Bad). The answer does not follow the instructions very well, is incomplete or not fully grounded.
1: (Very bad). The answer does not follow the instructions, is wrong and not grounded.
## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness,completeness, and fluency according to the crtieria.
STEP 2: Score based on the rubric.
Return result in JSON format. example output: { 'rating': 2 , evaluation: 'reason'}`,
],
["Q & A Quality Model ID", "gemini-1.5-pro-001"],
["Max Concurrent Requests (1-10)", "5"],
["Request Interval in Seconds(1-10)", "1"],
];

export const synth_q_and_a_TableHeader = [
[
"ID",
"GCS File URI",
"Mime Type",
"Generated Question",
"Expected Answer",
"Reasoning",
"Status",
"Response Time",
],
["ID", "GCS File URI", "Mime Type", "Generated Question", "Expected Answer", "Q & A Quality"],
];

// Summarization Table Format
Expand Down Expand Up @@ -108,6 +144,12 @@ export const summarization_TableHeader = [
];

// Eval Maps
// Maps a question-answering rating to its human-readable label for display.
// NOTE(review): keys here are strings ("1".."5"), while the neighboring
// mapSummaryQualityScore uses numeric keys — confirm callers coerce the
// model's numeric `rating` to a string before lookup.
export const mapQuestionAnsweringScore = new Map([
  ["1", "1-Very Bad"],
  ["2", "2-Bad"],
  ["3", "3-OK"],
  ["4", "4-Good"],
  ["5", "5-Very Good"],
]);

export const mapSummaryQualityScore = new Map();
mapSummaryQualityScore.set(1, "1-Very Bad");
Expand Down Expand Up @@ -184,4 +226,3 @@ export class ResourceNotFoundError extends Error {
this.statusCode = 404; // Optional: HTTP status code for API errors
}
}

75 changes: 75 additions & 0 deletions src/excel/create_tables.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { appendError, showStatus } from "../ui.js";

/**
 * Builds a task's configuration table on the active Excel worksheet.
 *
 * Writes a bold, yellow-highlighted title into cell A1, then creates an Excel
 * table named "<worksheet>.ConfigTable" at tableRangeStart whose header row is
 * configValuesArray[0] and whose data rows are the remaining entries. Finally
 * autofits the used range and applies wrap/shrink formatting to tableRangeEnd
 * so long values (e.g. system instructions) stay readable.
 *
 * @param {string} taskTitle - Title written to A1 and used in error messages.
 * @param {Array<Array<string>>} configValuesArray - Header row followed by
 *   [name, value] configuration rows.
 * @param {string} tableRangeStart - Range address where the table is created.
 * @param {string} tableRangeEnd - Range address that receives wrap/shrink
 *   text formatting.
 * @returns {Promise<void>} Resolves after the sheet is synced. Errors are
 *   surfaced via showStatus/appendError rather than rethrown.
 */
export async function createConfigTable(
  taskTitle,
  configValuesArray,
  tableRangeStart,
  tableRangeEnd,
) {
  await Excel.run(async (context) => {
    try {
      const currentWorksheet = context.workbook.worksheets.getActiveWorksheet();
      currentWorksheet.load("name");
      await context.sync();
      const worksheetName = currentWorksheet.name;

      // Task title banner in A1.
      const titleRange = currentWorksheet.getRange("A1");
      titleRange.values = [[taskTitle]];
      titleRange.format.font.bold = true;
      titleRange.format.fill.color = "yellow";
      titleRange.format.font.size = 16;

      const configTable = currentWorksheet.tables.add(tableRangeStart, true /*hasHeaders*/);
      configTable.name = `${worksheetName}.ConfigTable`;

      configTable.getHeaderRowRange().values = [configValuesArray[0]];

      configTable.rows.add(null, configValuesArray.slice(1));

      currentWorksheet.getUsedRange().format.autofitColumns();
      currentWorksheet.getUsedRange().format.autofitRows();
      currentWorksheet.getRange(tableRangeEnd).format.wrapText = true; // wrap system instructions
      currentWorksheet.getRange(tableRangeEnd).format.shrinkToFit = true; // shrinkToFit system instructions

      await context.sync();
    } catch (error) {
      // Best-effort UI reporting; the promise still resolves so the add-in
      // keeps running.
      showStatus(`Exception when creating ${taskTitle} Config Table: ${error.message}`, true);
      appendError(`Error creating ${taskTitle} Config Table:`, error);
    }
  });
}

/**
 * Builds a task's test-cases data table on the active Excel worksheet.
 *
 * Creates an Excel table named "<worksheet>.TestCasesTable" at tableRangeStart
 * with tableHeaderArray[0] as its header row, resizes it to tableRangeEnd so
 * result rows can be filled in later, and applies autofit plus wrap/shrink
 * formatting to the used range.
 *
 * @param {string} taskTitle - Task name used in error messages.
 * @param {Array<Array<string>>} tableHeaderArray - First element is the header
 *   row for the table.
 * @param {string} tableRangeStart - Range address where the table is created.
 * @param {string} tableRangeEnd - Range address the table is resized to.
 * @returns {Promise<void>} Resolves after the sheet is synced. Errors are
 *   surfaced via showStatus/appendError rather than rethrown.
 */
export async function createDataTable(
  taskTitle,
  tableHeaderArray,
  tableRangeStart,
  tableRangeEnd,
) {
  await Excel.run(async (context) => {
    try {
      const currentWorksheet = context.workbook.worksheets.getActiveWorksheet();
      currentWorksheet.load("name");
      await context.sync();
      const worksheetName = currentWorksheet.name;

      const velvetTable = currentWorksheet.tables.add(tableRangeStart, true /*hasHeaders*/);
      velvetTable.name = `${worksheetName}.TestCasesTable`;

      velvetTable.getHeaderRowRange().values = [tableHeaderArray[0]];

      // Pre-size the table so later result rows land inside it.
      velvetTable.resize(tableRangeEnd);
      currentWorksheet.getUsedRange().format.autofitColumns();
      currentWorksheet.getUsedRange().format.autofitRows();
      currentWorksheet.getUsedRange().format.wrapText = true;
      currentWorksheet.getUsedRange().format.shrinkToFit = true;

      await context.sync();
    } catch (error) {
      // Best-effort UI reporting; the promise still resolves so the add-in
      // keeps running.
      showStatus(`Exception when creating ${taskTitle} DataTable: ${error.message}`, true);
      appendError(`Error creating ${taskTitle} Data Table:`, error);
    }
  });
}
1 change: 1 addition & 0 deletions src/excel/excel_common.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ export function getColumn(table, columnName) {
showStatus(`Exception when getting column: ${JSON.stringify(error)}`, true);
}
}

6 changes: 3 additions & 3 deletions src/excel/excel_search_runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ export class ExcelSearchRunner extends TaskRunner {
ignoreAdversarialQuery: valueColumn.values[12][0],
ignoreNonSummarySeekingQuery: valueColumn.values[13][0],
summaryMatchingAdditionalPrompt: valueColumn.values[14][0],
batchSize: valueColumn.values[15][0],
timeBetweenCallsInSec: valueColumn.values[16][0],
batchSize: parseInt(valueColumn.values[15][0]),
timeBetweenCallsInSec: parseInt(valueColumn.values[16][0]),
accessToken: $("#access-token").val(),
systemInstruction: "",
responseMimeType: "text/plain",
Expand Down Expand Up @@ -146,7 +146,7 @@ export class ExcelSearchRunner extends TaskRunner {

async cancelAllTasks() {
this.throttled_process_summary.abort();
appendLog(`Cancel Requested for Search Tasks`);
appendLog(`Cancel Requested for Search Tasks`);
}

async processRow(response_json, context, config, rowNum) {
Expand Down
6 changes: 3 additions & 3 deletions src/excel/excel_summarization_runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ export class SummarizationRunner extends TaskRunner {
generateSummarizationVerbosity: valueColumn.values[7][0],
generateGroundedness: valueColumn.values[8][0],
generateFulfillment: valueColumn.values[9][0],
batchSize: valueColumn.values[10][0],
timeBetweenCallsInSec: valueColumn.values[11][0],
batchSize: parseInt(valueColumn.values[10][0]),
timeBetweenCallsInSec: parseInt(valueColumn.values[11][0]),
accessToken: $("#access-token").val(),

systemInstruction: "",
Expand Down Expand Up @@ -118,7 +118,7 @@ export class SummarizationRunner extends TaskRunner {
async getResultFromVertexAI(rowNum, config) {
const toSummarize = this.toSummarizeColumn.values;
const full_prompt = config.prompt + " Text to summarize: " + toSummarize[rowNum][0];
return await callGeminiMultitModal(rowNum, full_prompt, null, null, config);
return await callGeminiMultitModal(rowNum, full_prompt, null, null, null, config.model, config);
}

async waitForTaskstoFinish() {
Expand Down
Loading

0 comments on commit 3b1eef4

Please sign in to comment.