[Feat] Integrate Google Gemini API For Response Generation #24

Open · wants to merge 3 commits into base: main
7 changes: 5 additions & 2 deletions README.md
@@ -13,7 +13,7 @@ Welcome to the Voice Assistant project! 🎙️ Our goal is to create a modular
## Features 🧰

- **Modular Design**: Easily switch between different models for transcription, response generation, and TTS.
-- **Support for Multiple APIs**: Integrates with OpenAI, Groq, and Deepgram APIs, along with placeholders for local models.
+- **Support for Multiple APIs**: Integrates with OpenAI, Groq, Gemini, and Deepgram APIs, along with placeholders for local models.
- **Audio Recording and Playback**: Record audio from the microphone and play generated speech.
- **Configuration Management**: Centralized configuration in `config.py` for easy setup and management.

@@ -79,6 +79,7 @@ Create a `.env` file in the root directory and add your API keys:
```shell
OPENAI_API_KEY=your_openai_api_key
GROQ_API_KEY=your_groq_api_key
+GEMINI_API_KEY=your_gemini_api_key
DEEPGRAM_API_KEY=your_deepgram_api_key
LOCAL_MODEL_PATH=path/to/local/model
```
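As a quick sanity check that the new key is actually visible to the app, here is a minimal sketch (assuming the project loads `.env` with python-dotenv, which the `os.getenv` calls in `config.py` suggest):

```python
import os

from dotenv import load_dotenv  # assumption: this is how .env gets loaded

load_dotenv()  # reads .env from the current working directory
assert os.getenv("GEMINI_API_KEY"), "GEMINI_API_KEY missing from .env"
```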
@@ -90,12 +91,13 @@ Edit config.py to select the models you want to use:
class Config:
    # Model selection
    TRANSCRIPTION_MODEL = 'groq'  # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'
-    RESPONSE_MODEL = 'groq'  # Options: 'openai', 'groq', 'ollama', 'local'
+    RESPONSE_MODEL = 'groq'  # Options: 'openai', 'groq', 'ollama', 'gemini', 'local'
    TTS_MODEL = 'deepgram'  # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'

    # API keys and paths
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
```
@@ -180,6 +182,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the

- **OpenAI**: Uses OpenAI's GPT-4 model.
- **Groq**: Uses Groq's LLaMA model.
+- **Gemini**: Uses Google's Gemini 1.5 Flash model.
- **Ollama**: Uses any model served via Ollama.
- **Local**: Placeholder for a local language model.
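For reviewers who want to exercise the new backend in isolation, here is a minimal standalone sketch of the Gemini call path this PR wires in (it uses the `google-generativeai` SDK directly; the model name mirrors `Config.GEMINI_LLM`):

```python
import google.generativeai as genai

genai.configure(api_key="your_gemini_api_key")  # the GEMINI_API_KEY value
model = genai.GenerativeModel("gemini-1.5-flash")
chat = model.start_chat(history=[])  # the PR seeds this with prior turns
print(chat.send_message("Say hello in one sentence.").text)
```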

1 change: 1 addition & 0 deletions example.env
@@ -1,4 +1,5 @@
OPENAI_API_KEY="OPENAI_API_KEY"
+GEMINI_API_KEY="GEMINI_API_KEY"
GROQ_API_KEY="GROQ_API_KEY"
DEEPGRAM_API_KEY="DEEPGRAM_API_KEY"
ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
1 change: 1 addition & 0 deletions requirements.txt
@@ -19,6 +19,7 @@ SpeechRecognition==3.10.4
tqdm==4.66.4
typing_extensions==4.11.0
urllib3==2.2.1
+google-generativeai
colorama
deepgram-sdk
groq
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -10,7 +10,8 @@
    },
    "response":{
        "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "gemini": Config.GEMINI_API_KEY
    },
    "tts": {
        "openai": Config.OPENAI_API_KEY,
7 changes: 5 additions & 2 deletions voice_assistant/config.py
@@ -22,7 +22,7 @@ class Config:
"""
# Model selection
TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama, gemini
TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia

# currently using the MeloTTS for local models. here is how to get started:
@@ -32,10 +32,12 @@ class Config:
    OLLAMA_LLM="llama3:8b"
    GROQ_LLM="llama3-8b-8192"
    OPENAI_LLM="gpt-4o"
+    GEMINI_LLM="gemini-1.5-flash"

    # API keys and paths
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
    LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
@@ -58,7 +60,7 @@ def validate_config():
        Config._validate_model('TRANSCRIPTION_MODEL', [
            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
        Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'gemini', 'local'])
        Config._validate_model('TTS_MODEL', [
            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local'])

@@ -68,6 +70,7 @@ def validate_config():

        Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
        Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'gemini', 'GEMINI_API_KEY')

        Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
        Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
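For context on the new `gemini` validation call: `_validate_api_key` is defined outside this hunk. A plausible sketch of the contract the call relies on (a hypothetical reconstruction, not code from this PR) is that it raises only when the selected backend lacks its key:

```python
@staticmethod
def _validate_api_key(model_attr: str, model_value: str, api_key_attr: str):
    # Hypothetical: require a key only when the matching backend is selected.
    if getattr(Config, model_attr) == model_value and not getattr(Config, api_key_attr):
        raise ValueError(f"{api_key_attr} is required when {model_attr} is '{model_value}'")
```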
33 changes: 31 additions & 2 deletions voice_assistant/response_generation.py
@@ -5,7 +5,7 @@
from openai import OpenAI
from groq import Groq
import ollama
-
+import google.generativeai as genai
from voice_assistant.config import Config


@@ -29,6 +29,8 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
        return _generate_groq_response(api_key, chat_history)
    elif model == 'ollama':
        return _generate_ollama_response(chat_history)
+    elif model == 'gemini':
+        return _generate_gemini_response(chat_history)
    elif model == 'local':
        # Placeholder for local LLM response generation
        return "Generated response from local model"
@@ -61,4 +63,31 @@ def _generate_ollama_response(chat_history):
        model=Config.OLLAMA_LLM,
        messages=chat_history,
    )
-    return response['message']['content']
+    return response['message']['content']

+def _generate_gemini_response(chat_history):
+    genai.configure(api_key=Config.GEMINI_API_KEY)
+    model = genai.GenerativeModel(Config.GEMINI_LLM)
+    # Gemini expects history entries of the form {"role": "user" | "model", "parts": ...},
+    # while the assistant's history uses OpenAI-style {"role": ..., "content": ...}.
+    # Map the "system" and "assistant" roles to "model" and rename "content" to "parts".
+    converted_chat_history = [
+        {
+            "role": "model" if message["role"] in ("system", "assistant") else message["role"],
+            "parts": message["content"],
+        }
+        for message in chat_history
+    ]
+    # Pull the latest user message out of the history; it is sent as the new
+    # prompt, with the remaining turns passed along as prior conversation.
+    user_text = ""
+    for message in reversed(converted_chat_history):
+        if message["role"] == "user":
+            user_text = message["parts"]
+            converted_chat_history.remove(message)
+            break
+    # Start a chat seeded with the converted history and generate a response.
+    chat = model.start_chat(history=converted_chat_history)
+    response = chat.send_message(user_text)
+    return response.text
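A hedged usage sketch showing how the new backend is reached through the existing dispatcher (this assumes `local_model_path`, truncated in the signature above, defaults to `None`):

```python
from voice_assistant.config import Config
from voice_assistant.response_generation import generate_response

# OpenAI-style history; _generate_gemini_response converts it for Gemini.
chat_history = [
    {"role": "system", "content": "You are a helpful voice assistant."},
    {"role": "user", "content": "Give me a one-sentence weather report."},
]
print(generate_response("gemini", Config.GEMINI_API_KEY, chat_history))
```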