diff --git a/example/src/library_photo_to_website.ts b/example/src/library_photo_to_website.ts
index 5250c57..baade5d 100644
--- a/example/src/library_photo_to_website.ts
+++ b/example/src/library_photo_to_website.ts
@@ -44,6 +44,7 @@ const outputPath = path.join(workingDir, 'library.html')
 const bookLibraryWorkflow = workflow({
   members: [librarian, webmaster],
   description: `
+    Analyze the photo of the library and list all the books in the library.
     Generate a website that lists all the books in the library.
 
     The photo of books in the library is in the "${imagePath}" file.
diff --git a/packages/tools/src/vision.ts b/packages/tools/src/vision.ts
index b8f1444..92680e7 100644
--- a/packages/tools/src/vision.ts
+++ b/packages/tools/src/vision.ts
@@ -13,7 +13,7 @@ const encodeImage = async (imagePath: string): Promise<string> => {
 
 async function callOpenAI(
   provider: Provider,
-  analysis: string,
+  prompt: string,
   image_url: string,
   detail: 'low' | 'high'
 ) {
@@ -22,11 +22,11 @@ async function callOpenAI(
       {
         role: 'user',
         content: [
+          { type: 'image_url', image_url: { url: image_url, detail } },
           {
             type: 'text',
-            text: `${analysis}. Use your built-in OCR capabilities.`,
+            text: `${prompt}. Use your built-in OCR capabilities.`,
           },
-          { type: 'image_url', image_url: { url: image_url, detail } },
         ],
       },
     ],
@@ -57,12 +57,13 @@ async function callOpenAI(
 }
 export const visionTool = tool({
-  description: 'Tool for analyzing and OCR the pictures',
+  description:
+    'Analyzes the pictures using LLM Multimodal model with image to text (OCR) capabilities.',
   parameters: z.object({
     imagePathUrl: z.string().describe('Absolute path to image on disk or URL'),
-    analysis: z.string().describe(s`
-      Description of what to analyze and extract from the image, such as
-      text content, layout, font styles, and any specific data fields.'
+    prompt: z.string().describe(s`
+      This is a prompt for LLM Multimodal model - a detailed instruction of what to analyze and extract
+      from the image, such as: text content, layout, font styles, and any specific data fields.
     `),
     detail: z
       .enum(['low', 'high'])
      .describe(
@@ -71,10 +72,10 @@ export const visionTool = tool({
       )
       .default('high'),
   }),
-  execute: async ({ imagePathUrl, detail, analysis }, { provider }) => {
+  execute: async ({ imagePathUrl, detail, prompt }, { provider }) => {
     const imageUrl = imagePathUrl.startsWith('http')
       ? imagePathUrl
       : await encodeImage(imagePathUrl)
-    return callOpenAI(provider, analysis, imageUrl, detail)
+    return callOpenAI(provider, prompt, imageUrl, detail)
   },
 })
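
For context, a minimal sketch (not part of the patch) of how the renamed parameter surfaces at the call site. The `execute` signature, parameter names, and defaults come from the diff above; the direct invocation, the example path, and the injected `provider` value are illustrative assumptions, since the framework's tool-calling runtime normally supplies them:

```ts
// Illustrative only: calling the tool's execute() directly, outside the
// tool-calling runtime. `provider` is assumed to be the Provider instance
// the runtime would normally inject via the second argument.
const result = await visionTool.execute(
  {
    imagePathUrl: '/tmp/library.jpg', // absolute path on disk, or an http(s) URL
    prompt: 'List every book title visible on the shelves', // was `analysis` before this change
    detail: 'high', // 'low' trades fidelity for speed and cost; 'high' is the default
  },
  { provider }
)
```

Moving the `image_url` part before the `text` part in the user message, and renaming `analysis` to `prompt`, together frame the request as "image first, instruction second", which matches how multimodal chat payloads are commonly structured.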