Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add image generation capability to Telegram messaging (PR491 Resubmission) #1505

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
Open
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ IMAGE_GEN= # Set to TRUE to enable image generation
USE_OPENAI_EMBEDDING= # Set to TRUE for OpenAI/1536, leave blank for local
USE_OLLAMA_EMBEDDING= # Set to TRUE for OLLAMA/1024, leave blank for local

# Generation Prompts
SYSTEM_PROMPT= # Leave blank for no system prompt, or to use the one defined in the character config
IMAGE_GENERATION_PROMPT= # Leave blank for the default image generation prompt, or to use the one defined in the character config

# OpenRouter Models
OPENROUTER_MODEL= # Default: uses hermes 70b/405b
SMALL_OPENROUTER_MODEL=
Expand Down Expand Up @@ -187,6 +191,7 @@ TELEGRAM_BOT_TOKEN=

# Together Configuration
TOGETHER_API_KEY=
TOGETHER_IMAGE_MODEL= # Leave blank to use the default (black-forest-labs/FLUX.1-schnell)

# Server Configuration
SERVER_PORT=3000
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ tweets/
*.onnx
*.wav
*.mp3
*.png
*.jpg
*.jpeg
*.webp

logs/

Expand Down
3 changes: 3 additions & 0 deletions agent/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ import Database from "better-sqlite3";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import { character } from "./character.ts";
import { imageGenerationPlugin } from "@ai16z/plugin-image-generation";
import type { DirectClient } from "@ai16z/client-direct";
import yargs from "yargs";
import net from "net";

Expand Down
3 changes: 3 additions & 0 deletions mise.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[tools]
node = "23.1.0"
pnpm = "latest"
20 changes: 13 additions & 7 deletions packages/client-telegram/src/messageManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@ import { Context, Telegraf } from "telegraf";
import { composeContext, elizaLogger, ServiceType, composeRandomUser } from "@elizaos/core";
import { getEmbeddingZeroVector } from "@elizaos/core";
import {
composeContext,
ServiceType,
Content,
HandlerCallback,
IAgentRuntime,
getEmbeddingZeroVector,
IImageDescriptionService,
Memory,
ModelClass,
State,
UUID,
Media,
elizaLogger,
stringToUuid,
generateMessageResponse,
generateShouldRespond,
messageCompletionFooter,
shouldRespondFooter,
} from "@elizaos/core";
import { stringToUuid } from "@elizaos/core";

import { generateMessageResponse, generateShouldRespond } from "@elizaos/core";
import { messageCompletionFooter, shouldRespondFooter } from "@elizaos/core";

import { cosineSimilarity } from "./utils";
import {
MESSAGE_CONSTANTS,
Expand Down Expand Up @@ -135,10 +139,11 @@ Note that {{agentName}} is capable of reading/seeing/hearing various forms of me

{{recentMessages}}

# Task: Generate a post/reply in the voice, style and perspective of {{agentName}} (@{{twitterUserName}}) while using the thread of tweets as additional context:
# Task: Generate a reply in the voice and style of {{agentName}}, aka @{{twitterUserName}}
Write a very short reply that is from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the reply. Use the thread of tweets as additional context:
Current Post:
{{currentPost}}
Thread of Tweets You Are Replying To:
Thread of messages you are replying to:

{{formattedConversation}}
` + messageCompletionFooter;
Expand Down Expand Up @@ -681,6 +686,7 @@ export class MessageManager {
content: Content,
replyToMessageId?: number
): Promise<Message.TextMessage[]> {

if (content.attachments && content.attachments.length > 0) {
content.attachments.map(async (attachment: Media) => {
if (attachment.contentType.startsWith("image")) {
Expand Down
9 changes: 5 additions & 4 deletions packages/core/src/generation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -955,8 +955,10 @@ export const generateImage = async (
data?: string[];
error?: any;
}> => {
const model = getModel(runtime.imageModelProvider, ModelClass.IMAGE);
const modelSettings = models[runtime.imageModelProvider].imageSettings;
const imageModelProvider =
runtime.character.imageModelProvider ?? runtime.character.modelProvider;
const model = getModel(imageModelProvider, ModelClass.IMAGE);
const modelSettings = models[imageModelProvider].imageSettings;

elizaLogger.info("Generating image with options:", {
imageModelProvider: model,
Expand Down Expand Up @@ -1032,7 +1034,7 @@ export const generateImage = async (
) {
const together = new Together({ apiKey: apiKey as string });
const response = await together.images.create({
model: "black-forest-labs/FLUX.1-schnell",
model: runtime.getSetting("TOGETHER_IMAGE_MODEL") ?? "black-forest-labs/FLUX.1-schnell",
prompt: data.prompt,
width: data.width,
height: data.height,
Expand All @@ -1051,7 +1053,6 @@ export const generateImage = async (
throw new Error("Invalid response format from Together AI");
}

// Rest of the code remains the same...
const base64s = await Promise.all(
togetherResponse.data.map(async (image) => {
if (!image.url) {
Expand Down
5 changes: 4 additions & 1 deletion packages/core/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -649,11 +649,13 @@ export type Character = {

/** Optional system prompt */
system?: string;
/** Optional image generation prompt */
imageGenerationPrompt?: string;

/** Model provider to use */
modelProvider: ModelProviderName;

/** Image model provider to use, if different from modelProvider */
/** Optional image model provider to use, if different from modelProvider */
imageModelProvider?: ModelProviderName;

/** Optional model endpoint override */
Expand All @@ -666,6 +668,7 @@ export type Character = {
messageHandlerTemplate?: string;
shouldRespondTemplate?: string;
continueMessageHandlerTemplate?: string;
imagePromptTemplate? :string;
evaluationTemplate?: string;
twitterSearchTemplate?: string;
twitterActionTemplate?: string;
Expand Down
144 changes: 106 additions & 38 deletions packages/plugin-image-generation/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,55 @@
import { elizaLogger } from "@elizaos/core";
import {
Action,
composeContext,
generateText,
HandlerCallback,
IAgentRuntime,
Memory,
ModelClass,
Plugin,
State,
//generateCaption,
generateImage,
settings,
} from "@elizaos/core";
import { generateImage } from "@elizaos/core";

import fs from "fs";
import path from "path";
import { validateImageGenConfig } from "./environment";

/**
 * Default template for generating an image description in the agent's voice.
 *
 * The handler passes this template to `composeContext` together with the
 * composed state, but only when the character does not supply its own
 * `templates.imagePromptTemplate`. The `{{...}}` placeholders (knowledge, bio,
 * lore, post directions, providers, recent posts, adjective, topic) are the
 * context elements used to keep the description in character.
 *
 * The task section refers to a `<user_message>`; the template itself does not
 * contain one. The handler appends the incoming message text wrapped in
 * `<user_message>` tags after the composed context.
 */
const imagePromptTemplate = `# Knowledge
{{knowledge}}

About {{agentName}}:
{{bio}}
{{lore}}
{{postDirections}}

{{providers}}

{{recentPosts}}

# Task: Generate an image description in the voice and style of {{agentName}} according to the previous <user_message>.
Write a short image description that considers the <user_message> complemented by {{adjective}} about {{topic}} from the perspective of {{agentName}}. Try to write something totally different than previous posts. Do not add commentary or acknowledge this request, just write the description of the image to be generated.
Your response should not contain any questions. Brief, concise statements only. No emojis. Use \\n\\n (double spaces) between statements.`;

/**
* Prompt for the image generation AI to create detailed, high-quality prompts
* that will produce visually appealing images.
*/
const imageGenerationPrompt = "You are an AI assistant specialized in crafting effective prompts for image generation. Your task is to analyze a user's message and create a comprehensive, natural-language prompt that will guide an image generation algorithm to produce high-quality, visually appealing images.\n\nBegin by analyzing the content of the user's message. Follow these steps:\n\n1. List out key elements from the user's message, categorizing them to ensure comprehensive coverage:\n * Topic: The main subject or scene with specific details\n * Material: The medium or style (e.g., digital painting, 3D render)\n * Style: The artistic direction (e.g., fantasy, vaporwave)\n * Artist: Specific artists to influence the visual style\n * Webpage Influence: Art platforms like ArtStation or DeviantArt for quality enhancement\n * Sharpness: Terms like \"sharp focus\" or \"highly detailed\" for clarity\n * Extra Details: Descriptors to enhance atmosphere (e.g., cinematic, dystopian)\n * Shade and Color: Color-related keywords to control mood (e.g., moody lighting)\n * Lighting and Brightness: Specific lighting styles (e.g., dramatic shadows)\n * Camera Angle: Perspective and framing (e.g., close-up, wide shot, aerial view)\n * Composition: Layout guidance (e.g., rule of thirds, centered, dynamic)\n * Time Period: Temporal context if relevant\n * Cultural Elements: Any specific cultural influences\n * Textures: Surface quality descriptions\n * Weather/Atmosphere: Environmental conditions if applicable\n * Negative Prompts: Elements to exclude from the image\n\n2. Brainstorm complementary elements that would enhance the user's vision:\n * Suggest fitting artists and styles if not specified\n * Consider atmospheric elements that would strengthen the concept\n * Identify potential technical aspects that would improve the result\n * Note any elements that should be avoided to maintain the desired look\n\n3. 
Construct your final prompt by:\n * Leading with the most important scene/subject details from the user's message\n * Incorporating all relevant technical and stylistic elements\n * Grouping related concepts together naturally\n * Maintaining clear, flowing language throughout\n * Adding complementary details that enhance but don't alter the core concept\n * Concluding with negative prompts separated by a \"Negative:\" marker\n\nRemember:\n- Preserve ALL specific details from the user's original message\n- Don't force details into a rigid template\n- Create a cohesive, readable description\n- Keep the focus on the user's core concept while enhancing it with technical and artistic refinements\n\nYour output should contain ONLY the final prompt text, with no additional explanations, tags, or formatting.";

/**
* Saves a base64-encoded image to the local filesystem
* @param base64Data - The base64-encoded image data
* @param filename - Name to use for the saved file (without extension)
* @returns The full filepath where the image was saved
*/
export function saveBase64Image(base64Data: string, filename: string): string {
// Create generatedImages directory if it doesn't exist
const imageDir = path.join(process.cwd(), "generatedImages");
Expand All @@ -35,6 +72,12 @@ export function saveBase64Image(base64Data: string, filename: string): string {
return filepath;
}

/**
* Saves an image from a Heurist URL to the local filesystem
* @param imageUrl - URL of the image to download and save
* @param filename - Name to use for the saved file (without extension)
* @returns Promise resolving to the full filepath where the image was saved
*/
export async function saveHeuristImage(
imageUrl: string,
filename: string
Expand Down Expand Up @@ -62,8 +105,13 @@ export async function saveHeuristImage(
return filepath;
}

/**
* Action definition for image generation capability
* Handles generating images based on user prompts while maintaining agent personality
*/
const imageGeneration: Action = {
name: "GENERATE_IMAGE",
// Alternative action names that should trigger image generation
similes: [
"IMAGE_GENERATION",
"IMAGE_GEN",
Expand Down Expand Up @@ -96,6 +144,14 @@ const imageGeneration: Action = {
veniceApiKeyOk
);
},

/**
* Main handler for image generation:
* 1. Generates an image description in the agent's voice
* 2. Converts that description into an optimized image generation prompt
* 3. Generates the image
* 4. Saves and returns the result
*/
handler: async (
runtime: IAgentRuntime,
message: Memory,
Expand All @@ -115,21 +171,41 @@ const imageGeneration: Action = {
},
callback: HandlerCallback
) => {
elizaLogger.log("Composing state for message:", message);
state = (await runtime.composeState(message)) as State;
const agentContext = composeContext({
state,
template:
runtime.character.templates?.imagePromptTemplate ||
imagePromptTemplate,
});

// Generate the initial prompt in agent's voice
const agentImagePrompt = await generateText({
runtime,
context: `${agentContext}\n\n<user_message>${message.content.text}</user_message>`,
modelClass: ModelClass.SMALL,
});

elizaLogger.log("Agent prompt & caption for image: ", agentImagePrompt);

const userId = runtime.agentId;
elizaLogger.log("User ID:", userId);

const imagePrompt = message.content.text;
elizaLogger.log("Image prompt received:", imagePrompt);
// Convert agent's description into an optimized image generation prompt.
// The instruction text is taken from the character's system prompt, then the
// SYSTEM_PROMPT setting, then the built-in imageGenerationPrompt. `||` is used
// instead of `??` so that a blank value (SYSTEM_PROMPT is shipped empty in
// .env.example and may surface here as "") falls through to the next source
// rather than producing an empty instruction.
const promptInstructions =
    runtime.character.system ||
    settings.SYSTEM_PROMPT ||
    imageGenerationPrompt;

// Always append the agent-written description. In the previous single
// expression `+` bound tighter than `??`, so the <user_message> block was only
// attached to the built-in default and was silently dropped whenever a system
// prompt override was present.
const context = `${promptInstructions}\n\nHere is the user's message:\n<user_message> ${agentImagePrompt} </user_message>`;

const imagePrompt = await generateText({
runtime,
context,
modelClass: ModelClass.SMALL,
});
const imageSettings = runtime.character?.settings?.imageSettings || {};
elizaLogger.log("Image settings:", imageSettings);

// TODO: Generate a prompt for the image

const res: { image: string; caption: string }[] = [];

// Generate the actual image
elizaLogger.log("Generating image with prompt:", imagePrompt);
const images = await generateImage(
{
Expand All @@ -149,6 +225,7 @@ const imageGeneration: Action = {
runtime
);

// Process and save generated images
if (images.success && images.data && images.data.length > 0) {
elizaLogger.log(
"Image generation successful, number of images:",
Expand All @@ -167,46 +244,19 @@ const imageGeneration: Action = {

elizaLogger.log(`Processing image ${i + 1}:`, filename);

//just dont even add a caption or a description just have it generate & send
/*
try {
const imageService = runtime.getService(ServiceType.IMAGE_DESCRIPTION);
if (imageService && typeof imageService.describeImage === 'function') {
const caption = await imageService.describeImage({ imageUrl: filepath });
captionText = caption.description;
captionTitle = caption.title;
}
} catch (error) {
elizaLogger.error("Caption generation failed, using default caption:", error);
}*/

const _caption = "...";
/*= await generateCaption(
{
imageUrl: image,
},
runtime
);*/

res.push({ image: filepath, caption: "..." }); //caption.title });

elizaLogger.log(
`Generated caption for image ${i + 1}:`,
"..." //caption.title
);
//res.push({ image: image, caption: caption.title });
res.push({ image: filepath, caption: agentImagePrompt });

callback(
{
text: "...", //caption.description,
text: agentImagePrompt,
attachments: [
{
id: crypto.randomUUID(),
url: filepath,
title: "Generated image",
source: "imageGeneration",
description: "...", //caption.title,
text: "...", //caption.description,
description: imagePrompt, // caption.title?
text: agentImagePrompt,
contentType: "image/png",
},
],
Expand All @@ -223,6 +273,8 @@ const imageGeneration: Action = {
elizaLogger.error("Image generation failed or returned no data.");
}
},

// Example interactions that should trigger image generation
examples: [
// TODO: We want to generate images in more abstract ways, not just when asked to generate an image

Expand Down Expand Up @@ -291,9 +343,25 @@ const imageGeneration: Action = {
},
},
],
[
{
user: "{{user1}}",
content: { text: "Show me a picture of you" },
},
{
user: "{{agentName}}",
content: {
text: "Here's a picture of me",
action: "GENERATE_IMAGE",
},
},
],
],
} as Action;

/**
* Plugin definition for image generation functionality
*/
export const imageGenerationPlugin: Plugin = {
name: "imageGeneration",
description: "Generate images",
Expand Down
Loading