Add Support for new OpenAI model GPT-4o-mini + improve prompts (#200)

* feat: add support for GPT-4o mini
* feat: use uid instead of label so it's easier for smaller models to distinguish
normal-computing · Jul 18, 2024 · 461d2e8
1 parent fceef1a
commit 461d2e8
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 29 deletions.
diff --git a/src/helpers/aiSdkUtils.ts b/src/helpers/aiSdkUtils.ts
@@ -10,6 +10,7 @@ export enum SupportedModels {
   Gpt4VisionPreview = "gpt-4-vision-preview",
   Gpt4Turbo = "gpt-4-turbo",
   Gpt4O = "gpt-4o",
+  Gpt4OMini = "gpt-4o-mini",
   Claude3Sonnet = "claude-3-sonnet-20240229",
   Claude3Opus = "claude-3-opus-20240229",
   Claude35Sonnet = "claude-3-5-sonnet-20240620",
@@ -28,6 +29,7 @@ export const DisplayName = {
   [SupportedModels.Gpt4VisionPreview]: "GPT-4 Vision (Preview)",
   [SupportedModels.Gpt4Turbo]: "GPT-4 Turbo",
   [SupportedModels.Gpt4O]: "GPT-4o",
+  [SupportedModels.Gpt4OMini]: "GPT-4o Mini",
   [SupportedModels.Claude3Sonnet]: "Claude 3 Sonnet",
   [SupportedModels.Claude3Opus]: "Claude 3 Opus",
   [SupportedModels.Claude35Sonnet]: "Claude 3.5 Sonnet",
@@ -38,6 +40,7 @@ export function hasVisionSupport(model: SupportedModels) {
     model === SupportedModels.Gpt4VisionPreview ||
     model === SupportedModels.Gpt4Turbo ||
     model === SupportedModels.Gpt4O ||
+    model === SupportedModels.Gpt4OMini ||
     model === SupportedModels.Claude3Sonnet ||
     model === SupportedModels.Claude3Opus ||
     model === SupportedModels.Claude35Sonnet

diff --git a/src/helpers/dom-agent/determineNextAction.ts b/src/helpers/dom-agent/determineNextAction.ts
@@ -35,9 +35,15 @@ Example 1:
 }
 
 Example 2:
+{
+  thought: "I am typing 'fish food' into the search bar",
+  action: "setValue(123, 'fish food')"
+}
+
+Example 3:
 {
   thought: "I continue to scroll down to find the section",
-  action: "scroll("down")"
+  action: "scroll('down')"
 }
 
 Your response must always be in JSON format and must include "thought" and "action".
@@ -137,9 +143,9 @@ ${pageContents}`;
 // make action compatible with vision agent
 // TODO: refactor dom agent so we don't need this
 function visionActionAdapter(action: ParsedResponseSuccess): Action {
-  const args = { ...action.parsedAction.args, label: "" };
+  const args = { ...action.parsedAction.args, uid: "" };
   if ("elementId" in args) {
-    args.label = args.elementId;
+    args.uid = args.elementId;
   }
   return {
     thought: action.thought,

diff --git a/src/helpers/rpc/performAction.ts b/src/helpers/rpc/performAction.ts
@@ -156,40 +156,31 @@ function createOperateTool(
         window.open(action.args.url, "_blank");
         break;
       case "click": {
-        const success = await click(domActions, action.args.label);
+        const success = await click(domActions, action.args.uid);
         if (!success) {
-          console.error(
-            "Unable to find element with label: ",
-            action.args.label,
-          );
+          console.error("Unable to find element with uid: ", action.args.uid);
         }
         break;
       }
       case "setValue": {
         const success = await setValue(
           domActions,
-          action.args.label,
+          action.args.uid,
           action.args.value || "",
         );
         if (!success) {
-          console.error(
-            "Unable to find element with label: ",
-            action.args.label,
-          );
+          console.error("Unable to find element with uid: ", action.args.uid);
         }
         break;
       }
       case "setValueAndEnter": {
         const success = await setValue(
           domActions,
-          action.args.label,
+          action.args.uid,
           (action.args.value || "") + "\n",
         );
         if (!success) {
-          console.error(
-            "Unable to find element with label: ",
-            action.args.label,
-          );
+          console.error("Unable to find element with uid: ", action.args.uid);
         }
         break;
       }

diff --git a/src/helpers/vision-agent/determineNextAction.ts b/src/helpers/vision-agent/determineNextAction.ts
@@ -14,7 +14,7 @@ You can use the following tools:
 
 ${allToolsDescriptions}
 
-You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label.
+You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding uid.
 You will also be given previous actions that you have taken. If something does not work, try find an alternative solution. For example, instead of searching for a specific item that the user requested, perform a general search and apply filters, or simply browse the results page.
 You will also be given additional information of annotations.
 
@@ -30,7 +30,7 @@ This is one example of expected response from you:
   "action": {
     "name": "click",
     "args": {
-      "label": "123"
+      "uid": "123"
     }
   }
 }

diff --git a/src/helpers/vision-agent/tools.ts b/src/helpers/vision-agent/tools.ts
@@ -3,22 +3,22 @@ import { z } from "zod";
 export const clickSchema = z.object({
   name: z.literal("click"),
   description: z
-    .literal("Click on an element with the label on the annotation.")
+    .literal("Click on an element with the uid on the annotation.")
     .optional(),
   args: z.object({
-    label: z.string(),
+    uid: z.string(),
   }),
 });
 
 export const setValueSchema = z.object({
   name: z.literal("setValue"),
   description: z
     .literal(
-      "Focus on and set the value of an input element with the label on the annotation.",
+      "Focus on and set the value of an input element with the uid on the annotation.",
     )
     .optional(),
   args: z.object({
-    label: z.string(),
+    uid: z.string(),
     value: z.string(),
   }),
 });
@@ -31,7 +31,7 @@ export const setValueAndEnterSchema = z.object({
     )
     .optional(),
   args: z.object({
-    label: z.string(),
+    uid: z.string(),
     value: z.string(),
   }),
 });

diff --git a/src/pages/content/drawLabels.ts b/src/pages/content/drawLabels.ts
@@ -202,7 +202,7 @@ const baseZIndex = 10000;
 
 type LabelDataWithElement = {
   element: Element;
-  label: string;
+  uid: string;
   name: string;
   tagName: string;
   role?: string;
@@ -229,7 +229,7 @@ function getLabelData(
     const uidString = uid.toString();
 
     const item: LabelDataWithElement = {
-      label: uidString,
+      uid: uidString,
       name,
       tagName: elem.tagName,
       element: elem,
@@ -310,8 +310,8 @@ export function addLabelsToDom(data: LabelDataWithElement[]) {
   const wrapper = document.createElement("div");
   wrapper.classList.add("_label_overlay_wrapper");
   wrapper.popover = "manual";
-  data.forEach(({ element, label }, index) => {
-    drawLabel(wrapper, element, label, baseZIndex + data.length - index);
+  data.forEach(({ element, uid }, index) => {
+    drawLabel(wrapper, element, uid, baseZIndex + data.length - index);
   });
   // set wrapper's width and height to match body
   wrapper.style.width = `${document.documentElement.scrollWidth}px`;