diff --git a/.changeset/smart-ducks-fold.md b/.changeset/smart-ducks-fold.md new file mode 100644 index 000000000000..2764bd255dce --- /dev/null +++ b/.changeset/smart-ducks-fold.md @@ -0,0 +1,5 @@ +--- +'ai': patch +--- + +feat (core): support https and data url strings in image parts diff --git a/content/docs/03-ai-sdk-core/03-prompts.mdx b/content/docs/03-ai-sdk-core/03-prompts.mdx index 6e062001fd6d..17ab07649724 100644 --- a/content/docs/03-ai-sdk-core/03-prompts.mdx +++ b/content/docs/03-ai-sdk-core/03-prompts.mdx @@ -76,15 +76,27 @@ const result = await generateText({ Instead of sending a text in the `content` property, you can send an array of parts that include text and other data types. Currently image and text parts are supported. -For models that support multi-modal inputs, user messages can include images. An `image` can be a base64-encoded image (`string`), an `ArrayBuffer`, a `Uint8Array`, -a `Buffer`, or a `URL` object. It is possible to mix text and multiple images. +For models that support multi-modal inputs, user messages can include images. An `image` can be one of the following: + +- base64-encoded image: + - `string` with base-64 encoded content + - data URL `string`, e.g. `data:image/png;base64,...` +- binary image: + - `ArrayBuffer` + - `Uint8Array` + - `Buffer` +- URL: + - http(s) URL `string`, e.g. `https://example.com/image.png` + - `URL` object, e.g. `new URL('https://example.com/image.png')` + +It is possible to mix text and multiple images. Not all models support all types of multi-modal inputs. Check the model's capabilities before using this feature. -#### Example: Buffer images +#### Example: Binary image (Buffer) ```ts highlight="8-11" const result = await generateText({ @@ -104,9 +116,7 @@ const result = await generateText({ }); ``` -#### Example: Base-64 encoded images - -You do not need a `data:...` prefix for the base64-encoded image. 
+#### Example: Base-64 encoded image (string) ```ts highlight="8-11" const result = await generateText({ @@ -126,9 +136,9 @@ const result = await generateText({ }); ``` -#### Example: Image URLs +#### Example: Image URL (string) -```ts highlight="8-13" +```ts highlight="8-12" const result = await generateText({ model: yourModel, messages: [ @@ -138,9 +148,8 @@ const result = await generateText({ { type: 'text', text: 'Describe the image in detail.' }, { type: 'image', - image: new URL( + image: 'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true', - ), }, ], }, diff --git a/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx b/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx index 3595827a7125..65f23de0a6b2 100644 --- a/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx +++ b/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx @@ -107,7 +107,7 @@ console.log(text); name: 'image', type: 'string | Uint8Array | Buffer | ArrayBuffer | URL', description: - 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object', + 'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.', }, ], }, diff --git a/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx b/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx index eb08ac104522..bbd2abb76c36 100644 --- a/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx +++ b/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx @@ -109,7 +109,7 @@ for await (const textPart of textStream) { name: 'image', type: 'string | Uint8Array | Buffer | ArrayBuffer | URL', description: - 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object', + 'The image content of the message part. 
Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.', }, ], }, diff --git a/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx b/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx index 08843c1b0f60..3659fff13b99 100644 --- a/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx +++ b/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx @@ -126,7 +126,7 @@ console.log(JSON.stringify(object, null, 2)); name: 'image', type: 'string | Uint8Array | Buffer | ArrayBuffer | URL', description: - 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object' + 'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.' } ] } diff --git a/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx b/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx index 47d563463aef..83781a1506a6 100644 --- a/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx +++ b/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx @@ -129,7 +129,7 @@ for await (const partialObject of partialObjectStream) { name: 'image', type: 'string | Uint8Array | Buffer | ArrayBuffer | URL', description: - 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object' + 'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.' } ] } diff --git a/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx b/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx index a7614711803e..a69dfbb26076 100644 --- a/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx +++ b/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx @@ -97,7 +97,7 @@ A helper function to create a streamable UI from LLM providers. 
This function is name: 'image', type: 'string | Uint8Array | Buffer | ArrayBuffer | URL', description: - 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object', + 'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.', }, ], }, diff --git a/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts b/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts index 81570ce951a6..605083f9ce99 100644 --- a/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts +++ b/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts @@ -15,9 +15,8 @@ async function main() { { type: 'text', text: 'Describe the image in detail.' }, { type: 'image', - image: new URL( + image: 'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true', - ), }, ], }, diff --git a/examples/ai-core/src/generate-text/google-multimodal-url.ts b/examples/ai-core/src/generate-text/google-multimodal-url.ts index 790c2ce45dfd..13740a50e98b 100644 --- a/examples/ai-core/src/generate-text/google-multimodal-url.ts +++ b/examples/ai-core/src/generate-text/google-multimodal-url.ts @@ -15,9 +15,8 @@ async function main() { { type: 'text', text: 'Describe the image in detail.' }, { type: 'image', - image: new URL( + image: 'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true', - ), }, ], }, diff --git a/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts b/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts index c452f7c80783..548fd75796f3 100644 --- a/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts +++ b/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts @@ -14,9 +14,8 @@ async function main() { { type: 'text', text: 'Describe the image in detail.' 
}, { type: 'image', - image: new URL( + image: 'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true', - ), }, ], }, diff --git a/examples/ai-core/src/generate-text/openai-multimodal-url.ts b/examples/ai-core/src/generate-text/openai-multimodal-url.ts index 2c0aac2369cb..1975a0bc74e6 100644 --- a/examples/ai-core/src/generate-text/openai-multimodal-url.ts +++ b/examples/ai-core/src/generate-text/openai-multimodal-url.ts @@ -15,9 +15,8 @@ async function main() { { type: 'text', text: 'Describe the image in detail.' }, { type: 'image', - image: new URL( + image: 'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true', - ), }, ], }, diff --git a/packages/core/core/prompt/convert-to-language-model-prompt.test.ts b/packages/core/core/prompt/convert-to-language-model-prompt.test.ts index 7b5f99c4b35f..f751e71c451e 100644 --- a/packages/core/core/prompt/convert-to-language-model-prompt.test.ts +++ b/packages/core/core/prompt/convert-to-language-model-prompt.test.ts @@ -1,34 +1,85 @@ import { convertToLanguageModelMessage } from './convert-to-language-model-prompt'; describe('convertToLanguageModelMessage', () => { - describe('assistant message', () => { - it('should ignore empty text parts', async () => { - const result = convertToLanguageModelMessage({ - role: 'assistant', - content: [ - { - type: 'text', - text: '', - }, - { - type: 'tool-call', - toolName: 'toolName', - toolCallId: 'toolCallId', - args: {}, - }, - ], + describe('user message', () => { + describe('image parts', () => { + it('should convert image string https url to URL object', async () => { + const result = convertToLanguageModelMessage({ + role: 'user', + content: [ + { + type: 'image', + image: 'https://example.com/image.jpg', + }, + ], + }); + + expect(result).toEqual({ + role: 'user', + content: [ + { + type: 'image', + image: new URL('https://example.com/image.jpg'), + }, + ], + }); + }); + + it('should convert image string data url to 
base64 content', async () => { + const result = convertToLanguageModelMessage({ + role: 'user', + content: [ + { + type: 'image', + image: '', + }, + ], + }); + + expect(result).toEqual({ + role: 'user', + content: [ + { + type: 'image', + image: new Uint8Array([116, 101, 115, 116]), + mimeType: 'image/jpg', + }, + ], + }); }); + }); + }); + + describe('assistant message', () => { + describe('text parts', () => { + it('should ignore empty text parts', async () => { + const result = convertToLanguageModelMessage({ + role: 'assistant', + content: [ + { + type: 'text', + text: '', + }, + { + type: 'tool-call', + toolName: 'toolName', + toolCallId: 'toolCallId', + args: {}, + }, + ], + }); - expect(result).toEqual({ - role: 'assistant', - content: [ - { - type: 'tool-call', - args: {}, - toolCallId: 'toolCallId', - toolName: 'toolName', - }, - ], + expect(result).toEqual({ + role: 'assistant', + content: [ + { + type: 'tool-call', + args: {}, + toolCallId: 'toolCallId', + toolName: 'toolName', + }, + ], + }); }); }); }); diff --git a/packages/core/core/prompt/convert-to-language-model-prompt.ts b/packages/core/core/prompt/convert-to-language-model-prompt.ts index 48d4909c1003..62d75b49123a 100644 --- a/packages/core/core/prompt/convert-to-language-model-prompt.ts +++ b/packages/core/core/prompt/convert-to-language-model-prompt.ts @@ -9,6 +9,7 @@ import { detectImageMimeType } from '../util/detect-image-mimetype'; import { convertDataContentToUint8Array } from './data-content'; import { ValidatedPrompt } from './get-validated-prompt'; import { InvalidMessageRoleError } from './invalid-message-role-error'; +import { getErrorMessage } from '@ai-sdk/provider-utils'; export function convertToLanguageModelPrompt( prompt: ValidatedPrompt, @@ -80,6 +81,54 @@ export function convertToLanguageModelMessage( }; } + // try to convert string image parts to urls + if (typeof part.image === 'string') { + try { + const url = new URL(part.image); + + switch (url.protocol) { + case 
'http:': + case 'https:': { + return { + type: 'image', + image: url, + mimeType: part.mimeType, + }; + } + case 'data:': { + try { + const [header, base64Content] = part.image.split(','); + const mimeType = header.split(';')[0].split(':')[1]; + + if (mimeType == null || base64Content == null) { + throw new Error('Invalid data URL format'); + } + + return { + type: 'image', + image: + convertDataContentToUint8Array(base64Content), + mimeType, + }; + } catch (error) { + throw new Error( + `Error processing data URL: ${getErrorMessage( + message, + )}`, + ); + } + } + default: { + throw new Error( + `Unsupported URL protocol: ${url.protocol}`, + ); + } + } + } catch (_ignored) { + // not a URL + } + } + const imageUint8 = convertDataContentToUint8Array(part.image); return {