From e4d61ec470c62ce6d6cd0c96c6d3c54e20fd1ce3 Mon Sep 17 00:00:00 2001 From: Tom Daniel Date: Thu, 21 Nov 2024 19:23:42 +0000 Subject: [PATCH 1/4] Completed image generation for Telegram --- .env.example | 1 + .gitignore | 5 ++ agent/src/index.ts | 1 + mise.toml | 3 + .../client-telegram/src/messageManager.ts | 18 ++++- packages/client-twitter/src/post.ts | 2 +- packages/core/src/generation.ts | 42 +++++++---- packages/core/src/types.ts | 2 + packages/plugin-image-generation/src/index.ts | 73 ++++++++++++++++--- 9 files changed, 119 insertions(+), 28 deletions(-) create mode 100644 mise.toml diff --git a/.env.example b/.env.example index 3771417499..e023a580f1 100644 --- a/.env.example +++ b/.env.example @@ -82,6 +82,7 @@ TELEGRAM_BOT_TOKEN= # Together Configuration TOGETHER_API_KEY= +TOGETHER_IMAGE_MODEL= #Leave blank for default black-forest-labs/FLUX.1-schnell # Server Configuration SERVER_PORT=3000 diff --git a/.gitignore b/.gitignore index c1bf4fc6dc..c781ad5fe2 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,8 @@ packages/core/src/providers/cache/* cache/* packages/plugin-coinbase/src/plugins/transactions.csv packages/plugin-coinbase/package-lock.json + +*.png +*.jpg +*.jpeg +*.webp diff --git a/agent/src/index.ts b/agent/src/index.ts index 0077beb806..1cae261b62 100644 --- a/agent/src/index.ts +++ b/agent/src/index.ts @@ -38,6 +38,7 @@ import yargs from "yargs"; import path from "path"; import { fileURLToPath } from "url"; import { character } from "./character.ts"; +import { imageGenerationPlugin } from "@ai16z/plugin-image-generation"; import type { DirectClient } from "@ai16z/client-direct"; const __filename = fileURLToPath(import.meta.url); // get the resolved path to the file diff --git a/mise.toml b/mise.toml new file mode 100644 index 0000000000..e4fd296c73 --- /dev/null +++ b/mise.toml @@ -0,0 +1,3 @@ +[tools] +node = "23.1.0" +pnpm = "latest" diff --git a/packages/client-telegram/src/messageManager.ts b/packages/client-telegram/src/messageManager.ts index 8c7fe49c63..4090693cb3 100644 --- a/packages/client-telegram/src/messageManager.ts +++ b/packages/client-telegram/src/messageManager.ts @@ -1,8 +1,10 @@ import { Message } from "@telegraf/types"; -import { Context, Telegraf } from "telegraf"; +import { Context, Telegraf, Input } from "telegraf"; import { composeContext, elizaLogger, ServiceType } from "@ai16z/eliza"; import { embeddingZeroVector } from "@ai16z/eliza"; +import { Media } from "@ai16z/eliza"; +import { elizaLogger } from "@ai16z/eliza"; import { Content, HandlerCallback, @@ -125,10 +127,11 @@ Note that {{agentName}} is capable of reading/seeing/hearing various forms of me {{recentMessages}} -# Task: Generate a post/reply in the voice, style and perspective of {{agentName}} (@{{twitterUserName}}) while using the thread of tweets as additional context: +# Task: Generate a reply in the voice and style of {{agentName}}, aka @{{twitterUserName}} +Write a very short reply that is from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the reply. Use the thread of tweets as additional context: Current Post: {{currentPost}} -Thread of Tweets You Are Replying To: +Thread of messages you are replying to: {{formattedConversation}} ` + messageCompletionFooter; @@ -235,10 +238,18 @@ export class MessageManager { private async sendMessageInChunks( ctx: Context, content: string, + attachments?: Media[], replyToMessageId?: number ): Promise { const chunks = this.splitMessage(content); const sentMessages: Message.TextMessage[] = []; + const hasAttachment = attachments?.length > 0; + + if (hasAttachment) { + const sentMessage = (await ctx.replyWithPhoto(Input.fromLocalFile(attachments[0].url))); + + elizaLogger.log("Sent attachment: ", sentMessage); + } for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; @@ -443,6 +454,7 @@ export class MessageManager { const sentMessages = await this.sendMessageInChunks( ctx, content.text, + content.attachments, message.message_id ); diff --git a/packages/client-twitter/src/post.ts b/packages/client-twitter/src/post.ts index 6b02f41bbf..af5bfdbb8c 100644 --- a/packages/client-twitter/src/post.ts +++ b/packages/client-twitter/src/post.ts @@ -28,7 +28,7 @@ About {{agentName}} (@{{twitterUserName}}): # Task: Generate a post in the voice and style of {{agentName}}, aka @{{twitterUserName}} Write a single sentence post that is {{adjective}} about {{topic}} (without mentioning {{topic}} directly), from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the post. -Your response should not contain any questions. Brief, concise statements only. No emojis. Use \\n\\n (double spaces) between statements.`; +Your response should not contain any questions. Brief, concise statements only. Use \\n\\n (double spaces) between statements.`; const MAX_TWEET_LENGTH = 280; diff --git a/packages/core/src/generation.ts b/packages/core/src/generation.ts index 553df59e17..67607cf28e 100644 --- a/packages/core/src/generation.ts +++ b/packages/core/src/generation.ts @@ -768,15 +768,27 @@ export const generateImage = async ( count = 1; } - const model = getModel(runtime.character.modelProvider, ModelClass.IMAGE); - const modelSettings = models[runtime.character.modelProvider].imageSettings; - const apiKey = - runtime.token ?? - runtime.getSetting("HEURIST_API_KEY") ?? - runtime.getSetting("TOGETHER_API_KEY") ?? - runtime.getSetting("OPENAI_API_KEY"); + const imageModelProvider = + runtime.character.imageModelProvider ?? runtime.character.modelProvider; + + elizaLogger.log("imageModelProvider: ", imageModelProvider); + + const model = getModel(imageModelProvider, ModelClass.IMAGE); + const modelSettings = models[imageModelProvider].imageSettings; + let apiKey = runtime.token; + switch (imageModelProvider) { + case ModelProviderName.HEURIST: + apiKey = runtime.getSetting("HEURIST_API_KEY"); + break; + case ModelProviderName.LLAMACLOUD: + apiKey = runtime.getSetting("TOGETHER_API_KEY"); + break; + case ModelProviderName.OPENAI: + apiKey = runtime.getSetting("OPENAI_API_KEY"); + break; + } try { - if (runtime.character.modelProvider === ModelProviderName.HEURIST) { + if (imageModelProvider === ModelProviderName.HEURIST) { const response = await fetch( "http://sequencer.heurist.xyz/submit_job", { @@ -814,11 +826,11 @@ export const generateImage = async ( const imageURL = await response.json(); return { success: true, data: [imageURL] }; } else if ( - runtime.character.modelProvider === ModelProviderName.LLAMACLOUD + imageModelProvider === ModelProviderName.LLAMACLOUD ) { const together = new Together({ apiKey: apiKey as string }); const response = await together.images.create({ - model: "black-forest-labs/FLUX.1-schnell", + model: runtime.getSetting("TOGETHER_IMAGE_MODEL") ?? "black-forest-labs/FLUX.1-schnell", prompt, width, height, @@ -827,11 +839,14 @@ export const generateImage = async ( }); const urls: string[] = []; for (let i = 0; i < response.data.length; i++) { - const json = response.data[i].b64_json; + //const json = response.data[i].b64_json; // decode base64 - const base64 = Buffer.from(json, "base64").toString("base64"); - urls.push(base64); + //const base64 = Buffer.from(json, "base64").toString("base64"); + //urls.push(base64); + const data: unknown = response.data + urls.push(data[i].url); } + const base64s = await Promise.all( urls.map(async (url) => { const response = await fetch(url); @@ -842,6 +857,7 @@ export const generateImage = async ( return base64; }) ); + return { success: true, data: base64s }; } else { let targetSize = `${width}x${height}`; diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 733aa7f831..0a198459ba 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -602,6 +602,8 @@ export type Character = { /** Model provider to use */ modelProvider: ModelProviderName; + /** Optional image model provider */ + imageModelProvider?: ModelProviderName; /** Optional model endpoint override */ modelEndpointOverride?: string; diff --git a/packages/plugin-image-generation/src/index.ts b/packages/plugin-image-generation/src/index.ts index ab95d0c3f3..c3ed3ff308 100644 --- a/packages/plugin-image-generation/src/index.ts +++ b/packages/plugin-image-generation/src/index.ts @@ -1,9 +1,12 @@ import { elizaLogger } from "@ai16z/eliza"; import { Action, + composeContext, + generateText, HandlerCallback, IAgentRuntime, Memory, + ModelClass, Plugin, State, } from "@ai16z/eliza"; @@ -13,6 +16,22 @@ import fs from "fs"; import path from "path"; import { validateImageGenConfig } from "./enviroment"; +const imagePromptTemplate = `# Knowledge +{{knowledge}} + +About {{agentName}}: +{{bio}} +{{lore}} +{{postDirections}} + +{{providers}} + +{{recentPosts}} + +# Task: Generate an image description in the voice and style of {{agentName}} according to the previous . +Write a two sentence image description that considers the and may also include {{adjective}} about {{topic}} (without mentioning {{topic}} directly), from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the description of the image to be generated. +Your response should not contain any questions. Brief, concise statements only. No emojis. Use \\n\\n (double spaces) between statements.`; + export function saveBase64Image(base64Data: string, filename: string): string { // Create generatedImages directory if it doesn't exist const imageDir = path.join(process.cwd(), "generatedImages"); @@ -94,15 +113,34 @@ const imageGeneration: Action = { options: any, callback: HandlerCallback ) => { - elizaLogger.log("Composing state for message:", message); - state = (await runtime.composeState(message)) as State; + const agentContext = composeContext({ + state, + template: + runtime.character.templates?.imagePromptTemplate || + imagePromptTemplate, + }); + + const agentImagePrompt = await generateText({ + runtime, + context: `${agentContext}\n\n${message.content.text}`, + modelClass: ModelClass.SMALL, + }); + + elizaLogger.log("Agent image prompt:", agentImagePrompt); + + + //state = (await runtime.composeState(message)) as State; const userId = runtime.agentId; elizaLogger.log("User ID:", userId); - const imagePrompt = message.content.text; - elizaLogger.log("Image prompt received:", imagePrompt); + const context = `You are an AI assistant specialized in crafting effective prompts for image generation. Your task is to analyze a user's message and create a comprehensive, natural-language prompt that will guide an image generation algorithm to produce high-quality, visually appealing images.\n\nHere is the user's message:\n ${agentImagePrompt} \n\nBegin by analyzing the content of the user's message. Follow these steps:\n\n1. List out key elements from the user's message, categorizing them to ensure comprehensive coverage:\n * Topic: The main subject or scene with specific details\n * Material: The medium or style (e.g., digital painting, 3D render)\n * Style: The artistic direction (e.g., fantasy, vaporwave)\n * Artist: Specific artists to influence the visual style\n * Webpage Influence: Art platforms like ArtStation or DeviantArt for quality enhancement\n * Sharpness: Terms like "sharp focus" or "highly detailed" for clarity\n * Extra Details: Descriptors to enhance atmosphere (e.g., cinematic, dystopian)\n * Shade and Color: Color-related keywords to control mood (e.g., moody lighting)\n * Lighting and Brightness: Specific lighting styles (e.g., dramatic shadows)\n * Camera Angle: Perspective and framing (e.g., close-up, wide shot, aerial view)\n * Composition: Layout guidance (e.g., rule of thirds, centered, dynamic)\n * Time Period: Temporal context if relevant\n * Cultural Elements: Any specific cultural influences\n * Textures: Surface quality descriptions\n * Weather/Atmosphere: Environmental conditions if applicable\n * Negative Prompts: Elements to exclude from the image\n\n2. Brainstorm complementary elements that would enhance the user's vision:\n * Suggest fitting artists and styles if not specified\n * Consider atmospheric elements that would strengthen the concept\n * Identify potential technical aspects that would improve the result\n * Note any elements that should be avoided to maintain the desired look\n\n3. Construct your final prompt by:\n * Leading with the most important scene/subject details from the user's message\n * Incorporating all relevant technical and stylistic elements\n * Grouping related concepts together naturally\n * Maintaining clear, flowing language throughout\n * Adding complementary details that enhance but don't alter the core concept\n * Concluding with negative prompts separated by a "Negative:" marker\n\nRemember:\n- Preserve ALL specific details from the user's original message\n- Don't force details into a rigid template\n- Create a cohesive, readable description\n- Keep the focus on the user's core concept while enhancing it with technical and artistic refinements\n\nYour output should contain ONLY the final prompt text, with no additional explanations, tags, or formatting.`; - // TODO: Generate a prompt for the image + const imagePrompt = await generateText({ + runtime, + context, + modelClass: ModelClass.SMALL, + }); + elizaLogger.log("Image prompt received:", imagePrompt); const res: { image: string; caption: string }[] = []; @@ -148,7 +186,7 @@ const imageGeneration: Action = { elizaLogger.error("Caption generation failed, using default caption:", error); }*/ - const _caption = "..."; + //const caption = "..."; /*= await generateCaption( { imageUrl: image, @@ -156,25 +194,25 @@ const imageGeneration: Action = { runtime );*/ - res.push({ image: filepath, caption: "..." }); //caption.title }); + res.push({ image: filepath, caption: agentImagePrompt }); //caption.title }); elizaLogger.log( `Generated caption for image ${i + 1}:`, - "..." //caption.title + agentImagePrompt //caption.title ); //res.push({ image: image, caption: caption.title }); callback( { - text: "...", //caption.description, + text: agentImagePrompt, //caption.description, attachments: [ { id: crypto.randomUUID(), url: filepath, title: "Generated image", source: "imageGeneration", - description: "...", //caption.title, - text: "...", //caption.description, + description: imagePrompt, //caption.title, + text: agentImagePrompt, //caption.description, }, ], }, @@ -258,6 +296,19 @@ const imageGeneration: Action = { }, }, ], + [ + { + user: "{{user1}}", + content: { text: "Show me a picture of you" }, + }, + { + user: "{{agentName}}", + content: { + text: "Here's a picture of me", + action: "GENERATE_IMAGE", + }, + }, + ], ], } as Action; From ead313f601be8391d51613062e4f9e0f80bcd453 Mon Sep 17 00:00:00 2001 From: Tom Daniel Date: Mon, 25 Nov 2024 11:54:44 +0000 Subject: [PATCH 2/4] remove obsolete code --- packages/core/src/generation.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/core/src/generation.ts b/packages/core/src/generation.ts index 67607cf28e..a23080ce08 100644 --- a/packages/core/src/generation.ts +++ b/packages/core/src/generation.ts @@ -839,10 +839,6 @@ export const generateImage = async ( }); const urls: string[] = []; for (let i = 0; i < response.data.length; i++) { - //const json = response.data[i].b64_json; - // decode base64 - //const base64 = Buffer.from(json, "base64").toString("base64"); - //urls.push(base64); const data: unknown = response.data urls.push(data[i].url); } From 3f0a4310f37fd54fd85b52c7c9ba4cbbc2e5de4c Mon Sep 17 00:00:00 2001 From: Tom Daniel Date: Mon, 25 Nov 2024 12:11:38 +0000 Subject: [PATCH 3/4] make image generation prompt a config --- .env.example | 4 ++++ packages/core/src/types.ts | 2 ++ packages/plugin-image-generation/src/index.ts | 9 ++++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index e023a580f1..fe5fbebc8a 100644 --- a/.env.example +++ b/.env.example @@ -41,6 +41,10 @@ POST_INTERVAL_MAX= # Default: 180 IMAGE_GEN= # Set to TRUE to enable image generation USE_OPENAI_EMBEDDING= # Set to TRUE for OpenAI, leave blank for local +#Generation Prompts +SYSTEM_PROMPT= # Leave blank for empty system prompt or defined in character config +IMAGE_GENERATION_PROMPT= # Leave blank for default image generation prompt or defined in character config + # OpenRouter Models OPENROUTER_MODEL= # Default: uses hermes 70b/405b SMALL_OPENROUTER_MODEL= diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 0a198459ba..3af5c4cb1f 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -599,6 +599,8 @@ export type Character = { /** Optional system prompt */ system?: string; + /** Optional image generation prompt */ + imageGenerationPrompt?: string; /** Model provider to use */ modelProvider: ModelProviderName; diff --git a/packages/plugin-image-generation/src/index.ts b/packages/plugin-image-generation/src/index.ts index c3ed3ff308..664beb24b4 100644 --- a/packages/plugin-image-generation/src/index.ts +++ b/packages/plugin-image-generation/src/index.ts @@ -10,7 +10,7 @@ import { Plugin, State, } from "@ai16z/eliza"; -import { generateImage } from "@ai16z/eliza"; +import { generateCaption, generateImage, settings } from "@ai16z/eliza"; import fs from "fs"; import path from "path"; @@ -32,6 +32,8 @@ About {{agentName}}: Write a two sentence image description that considers the and may also include {{adjective}} about {{topic}} (without mentioning {{topic}} directly), from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the description of the image to be generated. Your response should not contain any questions. Brief, concise statements only. No emojis. Use \\n\\n (double spaces) between statements.`; +const imageGenerationPrompt = "You are an AI assistant specialized in crafting effective prompts for image generation. Your task is to analyze a user's message and create a comprehensive, natural-language prompt that will guide an image generation algorithm to produce high-quality, visually appealing images.\n\nBegin by analyzing the content of the user's message. Follow these steps:\n\n1. List out key elements from the user's message, categorizing them to ensure comprehensive coverage:\n * Topic: The main subject or scene with specific details\n * Material: The medium or style (e.g., digital painting, 3D render)\n * Style: The artistic direction (e.g., fantasy, vaporwave)\n * Artist: Specific artists to influence the visual style\n * Webpage Influence: Art platforms like ArtStation or DeviantArt for quality enhancement\n * Sharpness: Terms like \"sharp focus\" or \"highly detailed\" for clarity\n * Extra Details: Descriptors to enhance atmosphere (e.g., cinematic, dystopian)\n * Shade and Color: Color-related keywords to control mood (e.g., moody lighting)\n * Lighting and Brightness: Specific lighting styles (e.g., dramatic shadows)\n * Camera Angle: Perspective and framing (e.g., close-up, wide shot, aerial view)\n * Composition: Layout guidance (e.g., rule of thirds, centered, dynamic)\n * Time Period: Temporal context if relevant\n * Cultural Elements: Any specific cultural influences\n * Textures: Surface quality descriptions\n * Weather/Atmosphere: Environmental conditions if applicable\n * Negative Prompts: Elements to exclude from the image\n\n2. Brainstorm complementary elements that would enhance the user's vision:\n * Suggest fitting artists and styles if not specified\n * Consider atmospheric elements that would strengthen the concept\n * Identify potential technical aspects that would improve the result\n * Note any elements that should be avoided to maintain the desired look\n\n3. Construct your final prompt by:\n * Leading with the most important scene/subject details from the user's message\n * Incorporating all relevant technical and stylistic elements\n * Grouping related concepts together naturally\n * Maintaining clear, flowing language throughout\n * Adding complementary details that enhance but don't alter the core concept\n * Concluding with negative prompts separated by a \"Negative:\" marker\n\nRemember:\n- Preserve ALL specific details from the user's original message\n- Don't force details into a rigid template\n- Create a cohesive, readable description\n- Keep the focus on the user's core concept while enhancing it with technical and artistic refinements\n\nYour output should contain ONLY the final prompt text, with no additional explanations, tags, or formatting."; + export function saveBase64Image(base64Data: string, filename: string): string { // Create generatedImages directory if it doesn't exist const imageDir = path.join(process.cwd(), "generatedImages"); @@ -122,7 +124,7 @@ const imageGeneration: Action = { const agentImagePrompt = await generateText({ runtime, - context: `${agentContext}\n\n${message.content.text}`, + context: `${agentContext}\n\n${message.content.text}`, modelClass: ModelClass.SMALL, }); @@ -133,7 +135,8 @@ const imageGeneration: Action = { const userId = runtime.agentId; elizaLogger.log("User ID:", userId); - const context = `You are an AI assistant specialized in crafting effective prompts for image generation. Your task is to analyze a user's message and create a comprehensive, natural-language prompt that will guide an image generation algorithm to produce high-quality, visually appealing images.\n\nHere is the user's message:\n ${agentImagePrompt} \n\nBegin by analyzing the content of the user's message. Follow these steps:\n\n1. List out key elements from the user's message, categorizing them to ensure comprehensive coverage:\n * Topic: The main subject or scene with specific details\n * Material: The medium or style (e.g., digital painting, 3D render)\n * Style: The artistic direction (e.g., fantasy, vaporwave)\n * Artist: Specific artists to influence the visual style\n * Webpage Influence: Art platforms like ArtStation or DeviantArt for quality enhancement\n * Sharpness: Terms like "sharp focus" or "highly detailed" for clarity\n * Extra Details: Descriptors to enhance atmosphere (e.g., cinematic, dystopian)\n * Shade and Color: Color-related keywords to control mood (e.g., moody lighting)\n * Lighting and Brightness: Specific lighting styles (e.g., dramatic shadows)\n * Camera Angle: Perspective and framing (e.g., close-up, wide shot, aerial view)\n * Composition: Layout guidance (e.g., rule of thirds, centered, dynamic)\n * Time Period: Temporal context if relevant\n * Cultural Elements: Any specific cultural influences\n * Textures: Surface quality descriptions\n * Weather/Atmosphere: Environmental conditions if applicable\n * Negative Prompts: Elements to exclude from the image\n\n2. Brainstorm complementary elements that would enhance the user's vision:\n * Suggest fitting artists and styles if not specified\n * Consider atmospheric elements that would strengthen the concept\n * Identify potential technical aspects that would improve the result\n * Note any elements that should be avoided to maintain the desired look\n\n3. Construct your final prompt by:\n * Leading with the most important scene/subject details from the user's message\n * Incorporating all relevant technical and stylistic elements\n * Grouping related concepts together naturally\n * Maintaining clear, flowing language throughout\n * Adding complementary details that enhance but don't alter the core concept\n * Concluding with negative prompts separated by a "Negative:" marker\n\nRemember:\n- Preserve ALL specific details from the user's original message\n- Don't force details into a rigid template\n- Create a cohesive, readable description\n- Keep the focus on the user's core concept while enhancing it with technical and artistic refinements\n\nYour output should contain ONLY the final prompt text, with no additional explanations, tags, or formatting.`; + const context = runtime.character.system ?? + settings.SYSTEM_PROMPT ?? imageGenerationPrompt + `\n\nHere is the user's message:\n ${agentImagePrompt} `; const imagePrompt = await generateText({ runtime, From 1f2b4b608d80043b27bb156a90ffa3eb49832757 Mon Sep 17 00:00:00 2001 From: Tom Daniel Date: Mon, 25 Nov 2024 12:29:29 +0000 Subject: [PATCH 4/4] add comments and remove obsolete code --- packages/plugin-image-generation/src/index.ts | 90 +++++++++++-------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/packages/plugin-image-generation/src/index.ts b/packages/plugin-image-generation/src/index.ts index 664beb24b4..e805446b1f 100644 --- a/packages/plugin-image-generation/src/index.ts +++ b/packages/plugin-image-generation/src/index.ts @@ -16,6 +16,11 @@ import fs from "fs"; import path from "path"; import { validateImageGenConfig } from "./enviroment"; +/** + * Template for generating image descriptions in the agent's voice. + * Uses various context elements like knowledge, bio, and recent posts + * to maintain consistent character voice. + */ const imagePromptTemplate = `# Knowledge {{knowledge}} @@ -29,11 +34,21 @@ About {{agentName}}: {{recentPosts}} # Task: Generate an image description in the voice and style of {{agentName}} according to the previous . -Write a two sentence image description that considers the and may also include {{adjective}} about {{topic}} (without mentioning {{topic}} directly), from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the description of the image to be generated. +Write a short image description that considers the complemented by {{adjective}} about {{topic}} from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the description of the image to be generated. Your response should not contain any questions. Brief, concise statements only. No emojis. Use \\n\\n (double spaces) between statements.`; +/** + * Prompt for the image generation AI to create detailed, high-quality prompts + * that will produce visually appealing images. + */ const imageGenerationPrompt = "You are an AI assistant specialized in crafting effective prompts for image generation. Your task is to analyze a user's message and create a comprehensive, natural-language prompt that will guide an image generation algorithm to produce high-quality, visually appealing images.\n\nBegin by analyzing the content of the user's message. Follow these steps:\n\n1. List out key elements from the user's message, categorizing them to ensure comprehensive coverage:\n * Topic: The main subject or scene with specific details\n * Material: The medium or style (e.g., digital painting, 3D render)\n * Style: The artistic direction (e.g., fantasy, vaporwave)\n * Artist: Specific artists to influence the visual style\n * Webpage Influence: Art platforms like ArtStation or DeviantArt for quality enhancement\n * Sharpness: Terms like \"sharp focus\" or \"highly detailed\" for clarity\n * Extra Details: Descriptors to enhance atmosphere (e.g., cinematic, dystopian)\n * Shade and Color: Color-related keywords to control mood (e.g., moody lighting)\n * Lighting and Brightness: Specific lighting styles (e.g., dramatic shadows)\n * Camera Angle: Perspective and framing (e.g., close-up, wide shot, aerial view)\n * Composition: Layout guidance (e.g., rule of thirds, centered, dynamic)\n * Time Period: Temporal context if relevant\n * Cultural Elements: Any specific cultural influences\n * Textures: Surface quality descriptions\n * Weather/Atmosphere: Environmental conditions if applicable\n * Negative Prompts: Elements to exclude from the image\n\n2. Brainstorm complementary elements that would enhance the user's vision:\n * Suggest fitting artists and styles if not specified\n * Consider atmospheric elements that would strengthen the concept\n * Identify potential technical aspects that would improve the result\n * Note any elements that should be avoided to maintain the desired look\n\n3. Construct your final prompt by:\n * Leading with the most important scene/subject details from the user's message\n * Incorporating all relevant technical and stylistic elements\n * Grouping related concepts together naturally\n * Maintaining clear, flowing language throughout\n * Adding complementary details that enhance but don't alter the core concept\n * Concluding with negative prompts separated by a \"Negative:\" marker\n\nRemember:\n- Preserve ALL specific details from the user's original message\n- Don't force details into a rigid template\n- Create a cohesive, readable description\n- Keep the focus on the user's core concept while enhancing it with technical and artistic refinements\n\nYour output should contain ONLY the final prompt text, with no additional explanations, tags, or formatting."; +/** + * Saves a base64-encoded image to the local filesystem + * @param base64Data - The base64-encoded image data + * @param filename - Name to use for the saved file (without extension) + * @returns The full filepath where the image was saved + */ export function saveBase64Image(base64Data: string, filename: string): string { // Create generatedImages directory if it doesn't exist const imageDir = path.join(process.cwd(), "generatedImages"); @@ -56,6 +71,12 @@ export function saveBase64Image(base64Data: string, filename: string): string { return filepath; } +/** + * Saves an image from a Heurist URL to the local filesystem + * @param imageUrl - URL of the image to download and save + * @param filename - Name to use for the saved file (without extension) + * @returns Promise resolving to the full filepath where the image was saved + */ export async function saveHeuristImage( imageUrl: string, filename: string @@ -83,8 +104,13 @@ export async function saveHeuristImage( return filepath; } +/** + * Action definition for image generation capability + * Handles generating images based on user prompts while maintaining agent personality + */ const imageGeneration: Action = { name: "GENERATE_IMAGE", + // Alternative action names that should trigger image generation similes: [ "IMAGE_GENERATION", "IMAGE_GEN", @@ -97,6 +123,10 @@ const imageGeneration: Action = { "MAKE_A", ], description: "Generate an image to go along with the message.", + + /** + * Validates that required API keys are present for image generation + */ validate: async (runtime: IAgentRuntime, _message: Memory) => { await validateImageGenConfig(runtime); @@ -108,6 +138,14 @@ const imageGeneration: Action = { return anthropicApiKeyOk || togetherApiKeyOk || heuristApiKeyOk; }, + + /** + * Main handler for image generation: + * 1. Generates an image description in the agent's voice + * 2. Converts that description into an optimized image generation prompt + * 3. Generates the image + * 4. Saves and returns the result + */ handler: async ( runtime: IAgentRuntime, message: Memory, @@ -122,31 +160,32 @@ const imageGeneration: Action = { imagePromptTemplate, }); + // Generate the initial prompt in agent's voice const agentImagePrompt = await generateText({ runtime, context: `${agentContext}\n\n${message.content.text}`, modelClass: ModelClass.SMALL, }); - elizaLogger.log("Agent image prompt:", agentImagePrompt); - + elizaLogger.log("Agent prompt & caption for image: ", agentImagePrompt); - //state = (await runtime.composeState(message)) as State; const userId = runtime.agentId; elizaLogger.log("User ID:", userId); + // Convert agent's description into an optimized image generation prompt const context = runtime.character.system ?? settings.SYSTEM_PROMPT ?? imageGenerationPrompt + `\n\nHere is the user's message:\n ${agentImagePrompt} `; + // Generate the technical prompt for the image generation model const imagePrompt = await generateText({ runtime, context, modelClass: ModelClass.SMALL, }); - elizaLogger.log("Image prompt received:", imagePrompt); const res: { image: string; caption: string }[] = []; + // Generate the actual image elizaLogger.log("Generating image with prompt:", imagePrompt); const images = await generateImage( { @@ -158,6 +197,7 @@ const imageGeneration: Action = { runtime ); + // Process and save generated images if (images.success && images.data && images.data.length > 0) { elizaLogger.log( "Image generation successful, number of images:", @@ -176,46 +216,19 @@ const imageGeneration: Action = { elizaLogger.log(`Processing image ${i + 1}:`, filename); - //just dont even add a caption or a description just have it generate & send - /* - try { - const imageService = runtime.getService(ServiceType.IMAGE_DESCRIPTION); - if (imageService && typeof imageService.describeImage === 'function') { - const caption = await imageService.describeImage({ imageUrl: filepath }); - captionText = caption.description; - captionTitle = caption.title; - } - } catch (error) { - elizaLogger.error("Caption generation failed, using default caption:", error); - }*/ - - //const caption = "..."; - /*= await generateCaption( - { - imageUrl: image, - }, - runtime - );*/ - - res.push({ image: filepath, caption: agentImagePrompt }); //caption.title }); - - elizaLogger.log( - `Generated caption for image ${i + 1}:`, - agentImagePrompt //caption.title - ); - //res.push({ image: image, caption: caption.title }); + res.push({ image: filepath, caption: agentImagePrompt }); callback( { - text: agentImagePrompt, //caption.description, + text: agentImagePrompt, attachments: [ { id: crypto.randomUUID(), url: filepath, title: "Generated image", source: "imageGeneration", - description: imagePrompt, //caption.title, - text: agentImagePrompt, //caption.description, + description: imagePrompt, + text: agentImagePrompt, }, ], }, @@ -231,6 +244,8 @@ const imageGeneration: Action = { elizaLogger.error("Image generation failed or returned no data."); } }, + + // Example interactions that should trigger image generation examples: [ // TODO: We want to generate images in more abstract ways, not just when asked to generate an image @@ -315,6 +330,9 @@ const imageGeneration: Action = { ], } as Action; +/** + * Plugin definition for image generation functionality + */ export const imageGenerationPlugin: Plugin = { name: "imageGeneration", description: "Generate images",