diff --git a/.changeset/smart-ducks-fold.md b/.changeset/smart-ducks-fold.md
new file mode 100644
index 000000000000..2764bd255dce
--- /dev/null
+++ b/.changeset/smart-ducks-fold.md
@@ -0,0 +1,5 @@
+---
+'ai': patch
+---
+
+feat (core): support https and data url strings in image parts
diff --git a/content/docs/03-ai-sdk-core/03-prompts.mdx b/content/docs/03-ai-sdk-core/03-prompts.mdx
index 6e062001fd6d..17ab07649724 100644
--- a/content/docs/03-ai-sdk-core/03-prompts.mdx
+++ b/content/docs/03-ai-sdk-core/03-prompts.mdx
@@ -76,15 +76,27 @@ const result = await generateText({
Instead of sending a text in the `content` property, you can send an array of parts that include text and other data types.
Currently image and text parts are supported.
-For models that support multi-modal inputs, user messages can include images. An `image` can be a base64-encoded image (`string`), an `ArrayBuffer`, a `Uint8Array`,
-a `Buffer`, or a `URL` object. It is possible to mix text and multiple images.
+For models that support multi-modal inputs, user messages can include images. An `image` can be one of the following:
+
+- base64-encoded image:
+ - `string` with base-64 encoded content
+ - data URL `string`, e.g. `data:image/png;base64,...`
+- binary image:
+ - `ArrayBuffer`
+ - `Uint8Array`
+ - `Buffer`
+- URL:
+ - http(s) URL `string`, e.g. `https://example.com/image.png`
+ - `URL` object, e.g. `new URL('https://example.com/image.png')`
+
+It is possible to mix text and multiple images.
Not all models support all types of multi-modal inputs. Check the model's
capabilities before using this feature.
-#### Example: Buffer images
+#### Example: Binary image (Buffer)
```ts highlight="8-11"
const result = await generateText({
@@ -104,9 +116,7 @@ const result = await generateText({
});
```
-#### Example: Base-64 encoded images
-
-You do not need a `data:...` prefix for the base64-encoded image.
+#### Example: Base-64 encoded image (string)
```ts highlight="8-11"
const result = await generateText({
@@ -126,9 +136,9 @@ const result = await generateText({
});
```
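+
+#### Example: Base-64 encoded image (data URL string)
+
+You can also pass the image as a data URL `string`. The sketch below assumes the same `fs` import and image file as the binary image (Buffer) example above; the mime type is taken from the data URL header:
+
+```ts highlight="8-13"
+const result = await generateText({
+  model: yourModel,
+  messages: [
+    {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Describe the image in detail.' },
+        {
+          type: 'image',
+          image: `data:image/png;base64,${fs
+            .readFileSync('./data/comic-cat.png')
+            .toString('base64')}`,
+        },
+      ],
+    },
+  ],
+});
+```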
-#### Example: Image URLs
+#### Example: Image URL (string)
-```ts highlight="8-13"
+```ts highlight="8-12"
const result = await generateText({
model: yourModel,
messages: [
@@ -138,9 +148,8 @@ const result = await generateText({
{ type: 'text', text: 'Describe the image in detail.' },
{
type: 'image',
- image: new URL(
+ image:
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true',
- ),
},
],
},
diff --git a/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx b/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx
index 3595827a7125..65f23de0a6b2 100644
--- a/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx
+++ b/content/docs/07-reference/ai-sdk-core/01-generate-text.mdx
@@ -107,7 +107,7 @@ console.log(text);
name: 'image',
type: 'string | Uint8Array | Buffer | ArrayBuffer | URL',
description:
- 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object',
+          'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.',
},
],
},
diff --git a/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx b/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx
index eb08ac104522..bbd2abb76c36 100644
--- a/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx
+++ b/content/docs/07-reference/ai-sdk-core/02-stream-text.mdx
@@ -109,7 +109,7 @@ for await (const textPart of textStream) {
name: 'image',
type: 'string | Uint8Array | Buffer | ArrayBuffer | URL',
description:
- 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object',
+          'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.',
},
],
},
diff --git a/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx b/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx
index 08843c1b0f60..3659fff13b99 100644
--- a/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx
+++ b/content/docs/07-reference/ai-sdk-core/03-generate-object.mdx
@@ -126,7 +126,7 @@ console.log(JSON.stringify(object, null, 2));
name: 'image',
type: 'string | Uint8Array | Buffer | ArrayBuffer | URL',
description:
- 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object'
+          'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.'
}
]
}
diff --git a/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx b/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx
index 47d563463aef..83781a1506a6 100644
--- a/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx
+++ b/content/docs/07-reference/ai-sdk-core/04-stream-object.mdx
@@ -129,7 +129,7 @@ for await (const partialObject of partialObjectStream) {
name: 'image',
type: 'string | Uint8Array | Buffer | ArrayBuffer | URL',
description:
- 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object'
+          'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.'
}
]
}
diff --git a/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx b/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx
index a7614711803e..a69dfbb26076 100644
--- a/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx
+++ b/content/docs/07-reference/ai-sdk-rsc/01-stream-ui.mdx
@@ -97,7 +97,7 @@ A helper function to create a streamable UI from LLM providers. This function is
name: 'image',
type: 'string | Uint8Array | Buffer | ArrayBuffer | URL',
description:
- 'The image content of the message part. String are base64 encoded content. URLs need to be represented with a URL object',
+          'The image content of the message part. Strings are either base64 encoded content, base64 data URLs, or http(s) URLs.',
},
],
},
diff --git a/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts b/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts
index 81570ce951a6..605083f9ce99 100644
--- a/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts
+++ b/examples/ai-core/src/generate-text/anthropic-multimodal-url.ts
@@ -15,9 +15,8 @@ async function main() {
{ type: 'text', text: 'Describe the image in detail.' },
{
type: 'image',
- image: new URL(
+ image:
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true',
- ),
},
],
},
diff --git a/examples/ai-core/src/generate-text/google-multimodal-url.ts b/examples/ai-core/src/generate-text/google-multimodal-url.ts
index 790c2ce45dfd..13740a50e98b 100644
--- a/examples/ai-core/src/generate-text/google-multimodal-url.ts
+++ b/examples/ai-core/src/generate-text/google-multimodal-url.ts
@@ -15,9 +15,8 @@ async function main() {
{ type: 'text', text: 'Describe the image in detail.' },
{
type: 'image',
- image: new URL(
+ image:
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true',
- ),
},
],
},
diff --git a/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts b/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts
index c452f7c80783..548fd75796f3 100644
--- a/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts
+++ b/examples/ai-core/src/generate-text/google-vertex-multimodal-url.ts
@@ -14,9 +14,8 @@ async function main() {
{ type: 'text', text: 'Describe the image in detail.' },
{
type: 'image',
- image: new URL(
+ image:
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true',
- ),
},
],
},
diff --git a/examples/ai-core/src/generate-text/openai-multimodal-url.ts b/examples/ai-core/src/generate-text/openai-multimodal-url.ts
index 2c0aac2369cb..1975a0bc74e6 100644
--- a/examples/ai-core/src/generate-text/openai-multimodal-url.ts
+++ b/examples/ai-core/src/generate-text/openai-multimodal-url.ts
@@ -15,9 +15,8 @@ async function main() {
{ type: 'text', text: 'Describe the image in detail.' },
{
type: 'image',
- image: new URL(
+ image:
'https://github.com/vercel/ai/blob/main/examples/ai-core/data/comic-cat.png?raw=true',
- ),
},
],
},
diff --git a/packages/core/core/prompt/convert-to-language-model-prompt.test.ts b/packages/core/core/prompt/convert-to-language-model-prompt.test.ts
index 7b5f99c4b35f..f751e71c451e 100644
--- a/packages/core/core/prompt/convert-to-language-model-prompt.test.ts
+++ b/packages/core/core/prompt/convert-to-language-model-prompt.test.ts
@@ -1,34 +1,85 @@
import { convertToLanguageModelMessage } from './convert-to-language-model-prompt';
describe('convertToLanguageModelMessage', () => {
- describe('assistant message', () => {
- it('should ignore empty text parts', async () => {
- const result = convertToLanguageModelMessage({
- role: 'assistant',
- content: [
- {
- type: 'text',
- text: '',
- },
- {
- type: 'tool-call',
- toolName: 'toolName',
- toolCallId: 'toolCallId',
- args: {},
- },
- ],
+ describe('user message', () => {
+ describe('image parts', () => {
+ it('should convert image string https url to URL object', async () => {
+ const result = convertToLanguageModelMessage({
+ role: 'user',
+ content: [
+ {
+ type: 'image',
+ image: 'https://example.com/image.jpg',
+ },
+ ],
+ });
+
+ expect(result).toEqual({
+ role: 'user',
+ content: [
+ {
+ type: 'image',
+ image: new URL('https://example.com/image.jpg'),
+ },
+ ],
+ });
+ });
+
+      it('should convert image string data url to binary content with mime type', async () => {
+ const result = convertToLanguageModelMessage({
+ role: 'user',
+ content: [
+ {
+ type: 'image',
+ image: 'data:image/jpg;base64,dGVzdA==',
+ },
+ ],
+ });
+
+ expect(result).toEqual({
+ role: 'user',
+ content: [
+ {
+ type: 'image',
+ image: new Uint8Array([116, 101, 115, 116]),
+ mimeType: 'image/jpg',
+ },
+ ],
+ });
});
+ });
+ });
+
+ describe('assistant message', () => {
+ describe('text parts', () => {
+ it('should ignore empty text parts', async () => {
+ const result = convertToLanguageModelMessage({
+ role: 'assistant',
+ content: [
+ {
+ type: 'text',
+ text: '',
+ },
+ {
+ type: 'tool-call',
+ toolName: 'toolName',
+ toolCallId: 'toolCallId',
+ args: {},
+ },
+ ],
+ });
- expect(result).toEqual({
- role: 'assistant',
- content: [
- {
- type: 'tool-call',
- args: {},
- toolCallId: 'toolCallId',
- toolName: 'toolName',
- },
- ],
+ expect(result).toEqual({
+ role: 'assistant',
+ content: [
+ {
+ type: 'tool-call',
+ args: {},
+ toolCallId: 'toolCallId',
+ toolName: 'toolName',
+ },
+ ],
+ });
});
});
});
diff --git a/packages/core/core/prompt/convert-to-language-model-prompt.ts b/packages/core/core/prompt/convert-to-language-model-prompt.ts
index 48d4909c1003..62d75b49123a 100644
--- a/packages/core/core/prompt/convert-to-language-model-prompt.ts
+++ b/packages/core/core/prompt/convert-to-language-model-prompt.ts
@@ -9,6 +9,7 @@ import { detectImageMimeType } from '../util/detect-image-mimetype';
import { convertDataContentToUint8Array } from './data-content';
import { ValidatedPrompt } from './get-validated-prompt';
import { InvalidMessageRoleError } from './invalid-message-role-error';
+import { getErrorMessage } from '@ai-sdk/provider-utils';
export function convertToLanguageModelPrompt(
prompt: ValidatedPrompt,
@@ -80,6 +81,54 @@ export function convertToLanguageModelMessage(
};
}
+      // try to interpret string image parts as http(s) URLs or data URLs
+ if (typeof part.image === 'string') {
+ try {
+ const url = new URL(part.image);
+
+ switch (url.protocol) {
+ case 'http:':
+ case 'https:': {
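+              // http(s) URL string: pass it on as a URL object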
+ return {
+ type: 'image',
+ image: url,
+ mimeType: part.mimeType,
+ };
+ }
+ case 'data:': {
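+              // data URL: split into mime type header and base64 payload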
+ try {
+ const [header, base64Content] = part.image.split(',');
+ const mimeType = header.split(';')[0].split(':')[1];
+
+ if (mimeType == null || base64Content == null) {
+ throw new Error('Invalid data URL format');
+ }
+
+ return {
+ type: 'image',
+ image:
+ convertDataContentToUint8Array(base64Content),
+ mimeType,
+ };
+ } catch (error) {
+ throw new Error(
+ `Error processing data URL: ${getErrorMessage(
+                    error,
+ )}`,
+ );
+ }
+ }
+ default: {
+ throw new Error(
+ `Unsupported URL protocol: ${url.protocol}`,
+ );
+ }
+ }
+ } catch (_ignored) {
+          // not a URL string; fall through and treat it as base64 content
+ }
+ }
+
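+      // base64 string or binary image data: normalize to a Uint8Array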
const imageUint8 = convertDataContentToUint8Array(part.image);
return {