diff --git a/README.md b/README.md
index bf834a5..d6093c0 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Welcome to the Voice Assistant project! 🎙️ Our goal is to create a modular
 ## Features 🧰
 
 - **Modular Design**: Easily switch between different models for transcription, response generation, and TTS.
-- **Support for Multiple APIs**: Integrates with OpenAI, Groq, and Deepgram APIs, along with placeholders for local models.
+- **Support for Multiple APIs**: Integrates with OpenAI, Groq, Gemini, and Deepgram APIs, along with placeholders for local models.
 - **Audio Recording and Playback**: Record audio from the microphone and play generated speech.
 - **Configuration Management**: Centralized configuration in `config.py` for easy setup and management.
 
@@ -79,6 +79,7 @@ Create a `.env` file in the root directory and add your API keys:
 ```shell
 OPENAI_API_KEY=your_openai_api_key
 GROQ_API_KEY=your_groq_api_key
+GEMINI_API_KEY=your_gemini_api_key
 DEEPGRAM_API_KEY=your_deepgram_api_key
 LOCAL_MODEL_PATH=path/to/local/model
 ```
@@ -90,12 +91,13 @@ Edit config.py to select the models you want to use:
 ```python
 class Config:
     # Model selection
     TRANSCRIPTION_MODEL = 'groq' # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'
-    RESPONSE_MODEL = 'groq' # Options: 'openai', 'groq', 'ollama', 'local'
+    RESPONSE_MODEL = 'groq' # Options: 'openai', 'groq', 'ollama', 'gemini', 'local'
     TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'
     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
     DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
 ```
@@ -180,6 +182,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
 
 - **OpenAI**: Uses OpenAI's GPT-4 model.
 - **Groq**: Uses Groq's LLaMA model.
+- **Gemini**: Uses Google's Gemini 1.5 Flash model.
 - **Ollama**: Uses any model served via Ollama.
 - **Local**: Placeholder for a local language model.
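The README hunks above cover everything needed to route responses through Gemini: a key in `.env` and `RESPONSE_MODEL = 'gemini'` in `config.py`. A quick way to verify the key works before wiring it into the assistant is a one-off script along these lines (a minimal sketch, assuming `python-dotenv` and `google-generativeai` are installed; the file name and prompt are arbitrary):

```python
# smoke_test_gemini.py: hypothetical sanity check, not part of this diff
import os

import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()  # reads GEMINI_API_KEY from the .env file described above
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Reply with one short greeting.")
print(response.text)
```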
diff --git a/example.env b/example.env
index 819de71..d61e862 100644
--- a/example.env
+++ b/example.env
@@ -1,4 +1,5 @@
 OPENAI_API_KEY="OPENAI_API_KEY"
+GEMINI_API_KEY="GEMINI_API_KEY"
 GROQ_API_KEY="GROQ_API_KEY"
 DEEPGRAM_API_KEY="DEEPGRAM_API_KEY"
 ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
diff --git a/requirements.txt b/requirements.txt
index d533fbb..aa428a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ SpeechRecognition==3.10.4
 tqdm==4.66.4
 typing_extensions==4.11.0
 urllib3==2.2.1
+google-generativeai
 colorama
 deepgram-sdk
 groq
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 68668e3..41b8951 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -10,7 +10,8 @@
     },
     "response":{
         "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "gemini": Config.GEMINI_API_KEY
     },
     "tts": {
         "openai": Config.OPENAI_API_KEY,
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index f24695c..5264d2b 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -22,7 +22,7 @@ class Config:
     """
     # Model selection
     TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
+    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama, gemini
     TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia
 
     # currently using the MeloTTS for local models. here is how to get started:
@@ -32,10 +32,12 @@ class Config:
     OLLAMA_LLM="llama3:8b"
     GROQ_LLM="llama3-8b-8192"
     OPENAI_LLM="gpt-4o"
+    GEMINI_LLM="gemini-1.5-flash"
 
     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
     GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
     DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
@@ -58,7 +60,7 @@ def validate_config():
         Config._validate_model('TRANSCRIPTION_MODEL', [
             'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
         Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'gemini', 'local'])
         Config._validate_model('TTS_MODEL', [
             'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local'])
 
@@ -68,6 +70,7 @@ def validate_config():
 
         Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'gemini', 'GEMINI_API_KEY')
 
         Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
diff --git a/voice_assistant/response_generation.py b/voice_assistant/response_generation.py
index e6e3891..0d91bae 100644
--- a/voice_assistant/response_generation.py
+++ b/voice_assistant/response_generation.py
@@ -5,7 +5,7 @@
 from openai import OpenAI
 from groq import Groq
 import ollama
-
+import google.generativeai as genai
 from voice_assistant.config import Config
 
 
@@ -29,6 +29,8 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
         return _generate_groq_response(api_key, chat_history)
     elif model == 'ollama':
         return _generate_ollama_response(chat_history)
+    elif model == 'gemini':
+        return _generate_gemini_response(chat_history)
     elif model == 'local':
         # Placeholder for local LLM response generation
return "Generated response from local model" @@ -61,4 +63,31 @@ def _generate_ollama_response(chat_history): model=Config.OLLAMA_LLM, messages=chat_history, ) - return response['message']['content'] \ No newline at end of file + return response['message']['content'] + +def _generate_gemini_response(chat_history): + genai.configure(api_key=Config.GEMINI_API_KEY) + model = genai.GenerativeModel('gemini-1.5-flash') + # Convert chat history to the required format + # The current chat history structure is not compatible with the gemini model + # It expects the chat history to be in the format [{"role": "model", "parts": ""}] and [{"role": "user", "parts": ""}] + # However, the current chat history is in the format [{"role": "system", "content": ""}] and [{"role": "user", "content": ""}] + # To make it compatible, we need to convert the chat history by replacing "content" with "parts" + # Iterate over each message in the chat history + converted_chat_history = [ + {"role": "model" if (message["role"] == "system" or message["role"] == "assistant") else message["role"], "parts": message["content"]} + for message in chat_history + ] + # Extract and remove the last user message + user_text = "" + for message in reversed(converted_chat_history): + if message["role"] == "user": + converted_chat_history.remove(message) + user_text = message["parts"] + break + # Start a new chat and generate a response + chat = model.start_chat( + history=converted_chat_history + ) + response = chat.send_message(user_text) + return response.text \ No newline at end of file