Skip to content

Commit

Permalink
Add Support for new OpenAI model GPT-4o-mini + improve prompts (#200)
Browse files (browse the repository at this point in the history)
* feat: add support for GPT-4o mini

* feat: use uid instead of label so it's easier for smaller models to distinguish
  • Loading branch information
mondaychen authored Jul 18, 2024
1 parent fceef1a commit 461d2e8
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 29 deletions.
3 changes: 3 additions & 0 deletions src/helpers/aiSdkUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export enum SupportedModels {
Gpt4VisionPreview = "gpt-4-vision-preview",
Gpt4Turbo = "gpt-4-turbo",
Gpt4O = "gpt-4o",
Gpt4OMini = "gpt-4o-mini",
Claude3Sonnet = "claude-3-sonnet-20240229",
Claude3Opus = "claude-3-opus-20240229",
Claude35Sonnet = "claude-3-5-sonnet-20240620",
Expand All @@ -28,6 +29,7 @@ export const DisplayName = {
[SupportedModels.Gpt4VisionPreview]: "GPT-4 Vision (Preview)",
[SupportedModels.Gpt4Turbo]: "GPT-4 Turbo",
[SupportedModels.Gpt4O]: "GPT-4o",
[SupportedModels.Gpt4OMini]: "GPT-4o Mini",
[SupportedModels.Claude3Sonnet]: "Claude 3 Sonnet",
[SupportedModels.Claude3Opus]: "Claude 3 Opus",
[SupportedModels.Claude35Sonnet]: "Claude 3.5 Sonnet",
Expand All @@ -38,6 +40,7 @@ export function hasVisionSupport(model: SupportedModels) {
model === SupportedModels.Gpt4VisionPreview ||
model === SupportedModels.Gpt4Turbo ||
model === SupportedModels.Gpt4O ||
model === SupportedModels.Gpt4OMini ||
model === SupportedModels.Claude3Sonnet ||
model === SupportedModels.Claude3Opus ||
model === SupportedModels.Claude35Sonnet
Expand Down
12 changes: 9 additions & 3 deletions src/helpers/dom-agent/determineNextAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,15 @@ Example 1:
}
Example 2:
{
thought: "I am typing 'fish food' into the search bar",
action: "setValue(123, 'fish food')"
}
Example 3:
{
thought: "I continue to scroll down to find the section",
action: "scroll("down")"
action: "scroll('down')"
}
Your response must always be in JSON format and must include "thought" and "action".
Expand Down Expand Up @@ -137,9 +143,9 @@ ${pageContents}`;
// make action compatible with vision agent
// TODO: refactor dom agent so we don't need this
function visionActionAdapter(action: ParsedResponseSuccess): Action {
const args = { ...action.parsedAction.args, label: "" };
const args = { ...action.parsedAction.args, uid: "" };
if ("elementId" in args) {
args.label = args.elementId;
args.uid = args.elementId;
}
return {
thought: action.thought,
Expand Down
21 changes: 6 additions & 15 deletions src/helpers/rpc/performAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,40 +156,31 @@ function createOperateTool(
window.open(action.args.url, "_blank");
break;
case "click": {
const success = await click(domActions, action.args.label);
const success = await click(domActions, action.args.uid);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
case "setValue": {
const success = await setValue(
domActions,
action.args.label,
action.args.uid,
action.args.value || "",
);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
case "setValueAndEnter": {
const success = await setValue(
domActions,
action.args.label,
action.args.uid,
(action.args.value || "") + "\n",
);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
Expand Down
4 changes: 2 additions & 2 deletions src/helpers/vision-agent/determineNextAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ You can use the following tools:
${allToolsDescriptions}
You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label.
You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding uid.
You will also be given previous actions that you have taken. If something does not work, try find an alternative solution. For example, instead of searching for a specific item that the user requested, perform a general search and apply filters, or simply browse the results page.
You will also be given additional information of annotations.
Expand All @@ -30,7 +30,7 @@ This is one example of expected response from you:
"action": {
"name": "click",
"args": {
"label": "123"
"uid": "123"
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/helpers/vision-agent/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@ import { z } from "zod";
export const clickSchema = z.object({
name: z.literal("click"),
description: z
.literal("Click on an element with the label on the annotation.")
.literal("Click on an element with the uid on the annotation.")
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
}),
});

export const setValueSchema = z.object({
name: z.literal("setValue"),
description: z
.literal(
"Focus on and set the value of an input element with the label on the annotation.",
"Focus on and set the value of an input element with the uid on the annotation.",
)
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
value: z.string(),
}),
});
Expand All @@ -31,7 +31,7 @@ export const setValueAndEnterSchema = z.object({
)
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
value: z.string(),
}),
});
Expand Down
8 changes: 4 additions & 4 deletions src/pages/content/drawLabels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ const baseZIndex = 10000;

type LabelDataWithElement = {
element: Element;
label: string;
uid: string;
name: string;
tagName: string;
role?: string;
Expand All @@ -229,7 +229,7 @@ function getLabelData(
const uidString = uid.toString();

const item: LabelDataWithElement = {
label: uidString,
uid: uidString,
name,
tagName: elem.tagName,
element: elem,
Expand Down Expand Up @@ -310,8 +310,8 @@ export function addLabelsToDom(data: LabelDataWithElement[]) {
const wrapper = document.createElement("div");
wrapper.classList.add("_label_overlay_wrapper");
wrapper.popover = "manual";
data.forEach(({ element, label }, index) => {
drawLabel(wrapper, element, label, baseZIndex + data.length - index);
data.forEach(({ element, uid }, index) => {
drawLabel(wrapper, element, uid, baseZIndex + data.length - index);
});
// set wrapper's width and height to match body
wrapper.style.width = `${document.documentElement.scrollWidth}px`;
Expand Down

0 comments on commit 461d2e8

Please sign in to comment.