[Feat] Integrate Google Gemini API For Response Generation #24

Open · wants to merge 3 commits into base: main
7 changes: 5 additions & 2 deletions README.md
@@ -13,7 +13,7 @@ Welcome to the Voice Assistant project! 🎙️ Our goal is to create a modular
## Features 🧰

- **Modular Design**: Easily switch between different models for transcription, response generation, and TTS.
-- **Support for Multiple APIs**: Integrates with OpenAI, Groq, and Deepgram APIs, along with placeholders for local models.
+- **Support for Multiple APIs**: Integrates with OpenAI, Groq, Gemini, and Deepgram APIs, along with placeholders for local models.
- **Audio Recording and Playback**: Record audio from the microphone and play generated speech.
- **Configuration Management**: Centralized configuration in `config.py` for easy setup and management.

@@ -79,6 +79,7 @@ Create a `.env` file in the root directory and add your API keys:
```shell
OPENAI_API_KEY=your_openai_api_key
GROQ_API_KEY=your_groq_api_key
+GEMINI_API_KEY=your_gemini_api_key
DEEPGRAM_API_KEY=your_deepgram_api_key
LOCAL_MODEL_PATH=path/to/local/model
```
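As a quick sanity check that the new key is actually visible to the app, here is a minimal sketch (assuming the project loads `.env` with python-dotenv, which the `os.getenv` calls in `config.py` suggest):

```python
import os

from dotenv import load_dotenv  # assumption: this is how .env gets loaded

load_dotenv()  # reads .env from the current working directory
assert os.getenv("GEMINI_API_KEY"), "GEMINI_API_KEY missing from .env"
```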
@@ -90,12 +91,13 @@ Edit config.py to select the models you want to use:
class Config:
    # Model selection
    TRANSCRIPTION_MODEL = 'groq'  # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'
-    RESPONSE_MODEL = 'groq'  # Options: 'openai', 'groq', 'ollama', 'local'
+    RESPONSE_MODEL = 'groq'  # Options: 'openai', 'groq', 'ollama', 'gemini', 'local'
    TTS_MODEL = 'deepgram'  # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'

    # API keys and paths
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
```
@@ -180,6 +182,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the

- **OpenAI**: Uses OpenAI's GPT-4 model.
- **Groq**: Uses Groq's LLaMA model.
+- **Gemini**: Uses Google's Gemini 1.5 Flash model.
- **Ollama**: Uses any model served via Ollama.
- **Local**: Placeholder for a local language model.
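For reviewers who want to exercise the new backend in isolation, here is a minimal standalone sketch of the Gemini call path this PR wires in (it uses the `google-generativeai` SDK directly; the model name mirrors `Config.GEMINI_LLM`):

```python
import google.generativeai as genai

genai.configure(api_key="your_gemini_api_key")  # the GEMINI_API_KEY value
model = genai.GenerativeModel("gemini-1.5-flash")
chat = model.start_chat(history=[])  # the PR seeds this with prior turns
print(chat.send_message("Say hello in one sentence.").text)
```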

1 change: 1 addition & 0 deletions example.env
@@ -1,4 +1,5 @@
OPENAI_API_KEY="OPENAI_API_KEY"
+GEMINI_API_KEY="GEMINI_API_KEY"
GROQ_API_KEY="GROQ_API_KEY"
DEEPGRAM_API_KEY="DEEPGRAM_API_KEY"
ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
1 change: 1 addition & 0 deletions requirements.txt
@@ -19,6 +19,7 @@ SpeechRecognition==3.10.4
tqdm==4.66.4
typing_extensions==4.11.0
urllib3==2.2.1
+google-generativeai
colorama
deepgram-sdk
groq
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -10,7 +10,8 @@
    },
    "response":{
        "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "gemini": Config.GEMINI_API_KEY
    },
    "tts": {
        "openai": Config.OPENAI_API_KEY,
7 changes: 5 additions & 2 deletions voice_assistant/config.py
@@ -22,7 +22,7 @@ class Config:
"""
# Model selection
TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama, gemini
TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia

# currently using the MeloTTS for local models. here is how to get started:
@@ -32,10 +32,12 @@ class Config:
    OLLAMA_LLM="llama3:8b"
    GROQ_LLM="llama3-8b-8192"
    OPENAI_LLM="gpt-4o"
+    GEMINI_LLM="gemini-1.5-flash"

    # API keys and paths
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
    LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
@@ -58,7 +60,7 @@ def validate_config():
        Config._validate_model('TRANSCRIPTION_MODEL', [
            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
        Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'gemini', 'local'])
        Config._validate_model('TTS_MODEL', [
            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local'])

@@ -68,6 +70,7 @@ def validate_config():

        Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
        Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'gemini', 'GEMINI_API_KEY')

        Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
        Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
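For context on the new `gemini` validation call: `_validate_api_key` is defined outside this hunk. A plausible sketch of the contract the call relies on (a hypothetical reconstruction, not code from this PR) is that it raises only when the selected backend lacks its key:

```python
@staticmethod
def _validate_api_key(model_attr: str, model_value: str, api_key_attr: str):
    # Hypothetical: require a key only when the matching backend is selected.
    if getattr(Config, model_attr) == model_value and not getattr(Config, api_key_attr):
        raise ValueError(f"{api_key_attr} is required when {model_attr} is '{model_value}'")
```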
33 changes: 31 additions & 2 deletions voice_assistant/response_generation.py
@@ -5,7 +5,7 @@
from openai import OpenAI
from groq import Groq
import ollama
-
+import google.generativeai as genai
from voice_assistant.config import Config


@@ -29,6 +29,8 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
        return _generate_groq_response(api_key, chat_history)
    elif model == 'ollama':
        return _generate_ollama_response(chat_history)
+    elif model == 'gemini':
+        return _generate_gemini_response(chat_history)
    elif model == 'local':
        # Placeholder for local LLM response generation
        return "Generated response from local model"
@@ -61,4 +63,31 @@ def _generate_ollama_response(chat_history):
        model=Config.OLLAMA_LLM,
        messages=chat_history,
    )
-    return response['message']['content']
+    return response['message']['content']

+def _generate_gemini_response(chat_history):
+    genai.configure(api_key=Config.GEMINI_API_KEY)
+    model = genai.GenerativeModel(Config.GEMINI_LLM)
+    # Gemini expects history entries of the form {"role": "user" | "model", "parts": ...},
+    # while the assistant's history uses OpenAI-style {"role": ..., "content": ...}.
+    # Map the "system" and "assistant" roles to "model" and rename "content" to "parts".
+    converted_chat_history = [
+        {
+            "role": "model" if message["role"] in ("system", "assistant") else message["role"],
+            "parts": message["content"],
+        }
+        for message in chat_history
+    ]
+    # Pull the latest user message out of the history; it is sent as the new
+    # prompt, with the remaining turns passed along as prior conversation.
+    user_text = ""
+    for message in reversed(converted_chat_history):
+        if message["role"] == "user":
+            user_text = message["parts"]
+            converted_chat_history.remove(message)
+            break
+    # Start a chat seeded with the converted history and generate a response.
+    chat = model.start_chat(history=converted_chat_history)
+    response = chat.send_message(user_text)
+    return response.text
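A hedged usage sketch showing how the new backend is reached through the existing dispatcher (this assumes `local_model_path`, truncated in the signature above, defaults to `None`):

```python
from voice_assistant.config import Config
from voice_assistant.response_generation import generate_response

# OpenAI-style history; _generate_gemini_response converts it for Gemini.
chat_history = [
    {"role": "system", "content": "You are a helpful voice assistant."},
    {"role": "user", "content": "Give me a one-sentence weather report."},
]
print(generate_response("gemini", Config.GEMINI_API_KEY, chat_history))
```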