Skip to content

Commit

Permalink
Added Question Answering quality metric for Synthetic Q&A generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Abhinav Rau committed Sep 1, 2024
1 parent d9dec29 commit 3b1eef4
Show file tree
Hide file tree
Showing 17 changed files with 488 additions and 156 deletions.
83 changes: 62 additions & 21 deletions src/common.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
/**
 * Returns the index of the first row in a 2-D array whose first cell equals
 * the given value.
 *
 * @param {Array<Array<*>>} array2D - Rows whose first element is the lookup key.
 * @param {*} searchValue - Value compared (strict equality) against row[0].
 * @returns {number} Index of the matching row, or 0 when no row matches.
 */
export function findIndexByColumnsNameIn2DArray(array2D, searchValue) {
  const rowIndex = array2D.findIndex((row) => row[0] === searchValue);
  // NOTE(review): a miss falls back to 0, which is indistinguishable from a
  // hit on the first row — callers cannot detect "not found". Confirm whether
  // -1 (or null) would be safer before changing this sentinel.
  return rowIndex === -1 ? 0 : rowIndex;
}

// Vertex AI Search Table Format

export const vertex_ai_search_configValues = [
Expand Down Expand Up @@ -43,9 +54,8 @@ export const vertex_ai_search_testTableHeader = [

export var summaryMatching_prompt =
"You will get two answers to a question, you should determine if they are semantically similar or not. ";
export var summaryMatching_examples =
" examples - answer_1: I was created by X. answer_2: X created me. output:same " +
"answer_1:There are 52 days in a year. answer_2: A year is fairly long. output:different ";
export var summaryMatching_examples = `examples - answer_1: I was created by X. answer_2: X created me. output:same
answer_1:There are 52 days in a year. answer_2: A year is fairly long. output:different `;

// Synthetic Q&A Table Format
export const synth_q_and_a_configValues = [
Expand All @@ -55,27 +65,53 @@ export const synth_q_and_a_configValues = [
["Gemini Model ID", "gemini-1.5-flash-001"],
[
"System Instructions",
"You are an expert in reading call center policy and procedure documents." +
"Given the attached document, generate a question and answer that customers are likely to ask a call center agent." +
"The question should only be sourced from the provided the document.Do not use any other information other than the attached document. " +
"Explain your reasoning for the answer by quoting verbatim where in the document the answer is found. Return the results in JSON format.Example: " +
"{'question': 'Here is a question?', 'answer': 'Here is the answer', 'reasoning': 'Quote from document'}",
`Given the attached document, generate a question and an answer.The question should only be sourced from the provided the document. Do not use any other information other than the attached document. Explain your reasoning for the answer by quoting verbatim where in the document the answer is found. Return the results in JSON format.Example: {'question': 'Here is a question?', 'answer': 'Here is the answer', 'reasoning': 'Quote from document'}`,
],
["Batch Size (1-10)", "4"], // BatchSize
["Time between Batches in Seconds (1-10)", "2"],
[
"Prompt",
`You are an expert in reading call center policy and procedure documents. Generate question and answer a customer would ask from a Bank using the attached document.`,
],
["Generate Q & A Quality", "TRUE"],
[
"Q & A Quality Prompt",
`# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user prompt and an AI-generated responses.
You should first read the user prompt carefully for analyzing the task, and then evaluate the quality of the responses based on and rules provided in the Evaluation section below.
# Evaluation
## Metric Definition
You will be assessing question answering quality, which measures the overall quality of the answer to the question in user prompt. Pay special attention to length constraints, such as in X words or in Y sentences. The instruction for performing a question-answering task is provided in the user prompt. The response should not contain information that is not present in the context (if it is provided).
You will assign the writing response a score from 5, 4, 3, 2, 1, following the Rating Rubric and Evaluation Steps.
Give step-by-step explanations for your scoring, and only choose scores from 5, 4, 3, 2, 1.
## Criteria Definition
Instruction following: The response demonstrates a clear understanding of the question answering task instructions, satisfying all of the instruction's requirements.
Groundedness: The response contains information included only in the context if the context is present in user prompt. The response does not reference any outside information.
Completeness: The response completely answers the question with suffient detail.
Fluent: The response is well-organized and easy to read.
## Rating Rubric
5: (Very good). The answer follows instructions, is grounded, complete, and fluent.
4: (Good). The answer follows instructions, is grounded, complete, but is not very fluent.
3: (Ok). The answer mostly follows instructions, is grounded, answers the question partially and is not very fluent.
2: (Bad). The answer does not follow the instructions very well, is incomplete or not fully grounded.
1: (Very bad). The answer does not follow the instructions, is wrong and not grounded.
## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness,completeness, and fluency according to the crtieria.
STEP 2: Score based on the rubric.
Return result in JSON format. example output: { 'rating': 2 , evaluation: 'reason'}`,
],
["Q & A Quality Model ID", "gemini-1.5-pro-001"],
["Max Concurrent Requests (1-10)", "5"],
["Request Interval in Seconds(1-10)", "1"],
];

export const synth_q_and_a_TableHeader = [
[
"ID",
"GCS File URI",
"Mime Type",
"Generated Question",
"Expected Answer",
"Reasoning",
"Status",
"Response Time",
],
["ID", "GCS File URI", "Mime Type", "Generated Question", "Expected Answer", "Q & A Quality"],
];

// Summarization Table Format
Expand Down Expand Up @@ -108,6 +144,12 @@ export const summarization_TableHeader = [
];

// Eval Maps
// Maps a question-answering rating to its human-readable label for display.
// NOTE(review): keys here are strings ("1".."5"), while the neighboring
// mapSummaryQualityScore uses numeric keys — confirm callers coerce the
// model's numeric `rating` to a string before lookup.
export const mapQuestionAnsweringScore = new Map([
  ["1", "1-Very Bad"],
  ["2", "2-Bad"],
  ["3", "3-OK"],
  ["4", "4-Good"],
  ["5", "5-Very Good"],
]);

export const mapSummaryQualityScore = new Map();
mapSummaryQualityScore.set(1, "1-Very Bad");
Expand Down Expand Up @@ -184,4 +226,3 @@ export class ResourceNotFoundError extends Error {
this.statusCode = 404; // Optional: HTTP status code for API errors
}
}

75 changes: 75 additions & 0 deletions src/excel/create_tables.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { appendError, showStatus } from "../ui.js";

/**
 * Builds a task's configuration table on the active Excel worksheet.
 *
 * Writes a bold, yellow-highlighted title into cell A1, then creates an Excel
 * table named "<worksheet>.ConfigTable" at tableRangeStart whose header row is
 * configValuesArray[0] and whose data rows are the remaining entries. Finally
 * autofits the used range and applies wrap/shrink formatting to tableRangeEnd
 * so long values (e.g. system instructions) stay readable.
 *
 * @param {string} taskTitle - Title written to A1 and used in error messages.
 * @param {Array<Array<string>>} configValuesArray - Header row followed by
 *   [name, value] configuration rows.
 * @param {string} tableRangeStart - Range address where the table is created.
 * @param {string} tableRangeEnd - Range address that receives wrap/shrink
 *   text formatting.
 * @returns {Promise<void>} Resolves after the sheet is synced. Errors are
 *   surfaced via showStatus/appendError rather than rethrown.
 */
export async function createConfigTable(
  taskTitle,
  configValuesArray,
  tableRangeStart,
  tableRangeEnd,
) {
  await Excel.run(async (context) => {
    try {
      const currentWorksheet = context.workbook.worksheets.getActiveWorksheet();
      currentWorksheet.load("name");
      await context.sync();
      const worksheetName = currentWorksheet.name;

      // Task title banner in A1.
      const titleRange = currentWorksheet.getRange("A1");
      titleRange.values = [[taskTitle]];
      titleRange.format.font.bold = true;
      titleRange.format.fill.color = "yellow";
      titleRange.format.font.size = 16;

      const configTable = currentWorksheet.tables.add(tableRangeStart, true /*hasHeaders*/);
      configTable.name = `${worksheetName}.ConfigTable`;

      configTable.getHeaderRowRange().values = [configValuesArray[0]];

      configTable.rows.add(null, configValuesArray.slice(1));

      currentWorksheet.getUsedRange().format.autofitColumns();
      currentWorksheet.getUsedRange().format.autofitRows();
      currentWorksheet.getRange(tableRangeEnd).format.wrapText = true; // wrap system instructions
      currentWorksheet.getRange(tableRangeEnd).format.shrinkToFit = true; // shrinkToFit system instructions

      await context.sync();
    } catch (error) {
      // Best-effort UI reporting; the promise still resolves so the add-in
      // keeps running.
      showStatus(`Exception when creating ${taskTitle} Config Table: ${error.message}`, true);
      appendError(`Error creating ${taskTitle} Config Table:`, error);
    }
  });
}

/**
 * Builds a task's test-cases data table on the active Excel worksheet.
 *
 * Creates an Excel table named "<worksheet>.TestCasesTable" at tableRangeStart
 * with tableHeaderArray[0] as its header row, resizes it to tableRangeEnd so
 * result rows can be filled in later, and applies autofit plus wrap/shrink
 * formatting to the used range.
 *
 * @param {string} taskTitle - Task name used in error messages.
 * @param {Array<Array<string>>} tableHeaderArray - First element is the header
 *   row for the table.
 * @param {string} tableRangeStart - Range address where the table is created.
 * @param {string} tableRangeEnd - Range address the table is resized to.
 * @returns {Promise<void>} Resolves after the sheet is synced. Errors are
 *   surfaced via showStatus/appendError rather than rethrown.
 */
export async function createDataTable(
  taskTitle,
  tableHeaderArray,
  tableRangeStart,
  tableRangeEnd,
) {
  await Excel.run(async (context) => {
    try {
      const currentWorksheet = context.workbook.worksheets.getActiveWorksheet();
      currentWorksheet.load("name");
      await context.sync();
      const worksheetName = currentWorksheet.name;

      const velvetTable = currentWorksheet.tables.add(tableRangeStart, true /*hasHeaders*/);
      velvetTable.name = `${worksheetName}.TestCasesTable`;

      velvetTable.getHeaderRowRange().values = [tableHeaderArray[0]];

      // Pre-size the table so later result rows land inside it.
      velvetTable.resize(tableRangeEnd);
      currentWorksheet.getUsedRange().format.autofitColumns();
      currentWorksheet.getUsedRange().format.autofitRows();
      currentWorksheet.getUsedRange().format.wrapText = true;
      currentWorksheet.getUsedRange().format.shrinkToFit = true;

      await context.sync();
    } catch (error) {
      // Best-effort UI reporting; the promise still resolves so the add-in
      // keeps running.
      showStatus(`Exception when creating ${taskTitle} DataTable: ${error.message}`, true);
      appendError(`Error creating ${taskTitle} Data Table:`, error);
    }
  });
}
1 change: 1 addition & 0 deletions src/excel/excel_common.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ export function getColumn(table, columnName) {
showStatus(`Exception when getting column: ${JSON.stringify(error)}`, true);
}
}

6 changes: 3 additions & 3 deletions src/excel/excel_search_runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ export class ExcelSearchRunner extends TaskRunner {
ignoreAdversarialQuery: valueColumn.values[12][0],
ignoreNonSummarySeekingQuery: valueColumn.values[13][0],
summaryMatchingAdditionalPrompt: valueColumn.values[14][0],
batchSize: valueColumn.values[15][0],
timeBetweenCallsInSec: valueColumn.values[16][0],
batchSize: parseInt(valueColumn.values[15][0]),
timeBetweenCallsInSec: parseInt(valueColumn.values[16][0]),
accessToken: $("#access-token").val(),
systemInstruction: "",
responseMimeType: "text/plain",
Expand Down Expand Up @@ -146,7 +146,7 @@ export class ExcelSearchRunner extends TaskRunner {

async cancelAllTasks() {
this.throttled_process_summary.abort();
appendLog(`Cancel Requested for Search Tasks`);
appendLog(`Cancel Requested for Search Tasks`);
}

async processRow(response_json, context, config, rowNum) {
Expand Down
6 changes: 3 additions & 3 deletions src/excel/excel_summarization_runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ export class SummarizationRunner extends TaskRunner {
generateSummarizationVerbosity: valueColumn.values[7][0],
generateGroundedness: valueColumn.values[8][0],
generateFulfillment: valueColumn.values[9][0],
batchSize: valueColumn.values[10][0],
timeBetweenCallsInSec: valueColumn.values[11][0],
batchSize: parseInt(valueColumn.values[10][0]),
timeBetweenCallsInSec: parseInt(valueColumn.values[11][0]),
accessToken: $("#access-token").val(),

systemInstruction: "",
Expand Down Expand Up @@ -118,7 +118,7 @@ export class SummarizationRunner extends TaskRunner {
async getResultFromVertexAI(rowNum, config) {
const toSummarize = this.toSummarizeColumn.values;
const full_prompt = config.prompt + " Text to summarize: " + toSummarize[rowNum][0];
return await callGeminiMultitModal(rowNum, full_prompt, null, null, config);
return await callGeminiMultitModal(rowNum, full_prompt, null, null, null, config.model, config);
}

async waitForTaskstoFinish() {
Expand Down
Loading

0 comments on commit 3b1eef4

Please sign in to comment.